[med-svn] [falcon] 01/04: Imported Upstream version 1.8.6

Afif Elghraoui afif at moszumanska.debian.org
Wed Dec 28 08:13:30 UTC 2016


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository falcon.

commit 215b42650780bb288e2f39d96ffe4b2848224609
Author: Afif Elghraoui <afif at debian.org>
Date:   Tue Dec 27 23:41:51 2016 -0800

    Imported Upstream version 1.8.6
---
 FALCON/falcon_kit/bash.py                          |   36 +-
 FALCON/falcon_kit/functional.py                    |   14 +-
 FALCON/falcon_kit/mains/LAmerge.py                 |   51 +
 FALCON/falcon_kit/mains/LAsort.py                  |   46 +
 FALCON/falcon_kit/mains/run1.py                    |   14 +-
 FALCON/falcon_kit/pype_tasks.py                    |    8 +-
 FALCON/test/test_functional.py                     |    8 +
 FALCON_unzip/LICENSE.txt                           |   32 -
 FALCON_unzip/README.md                             |    1 -
 FALCON_unzip/examples/fc_unzip.cfg                 |   17 -
 FALCON_unzip/examples/unzip.sh                     |    2 -
 FALCON_unzip/examples/unzip_obsoleted.sh           |   33 -
 FALCON_unzip/falcon_unzip/__init__.py              |   36 -
 FALCON_unzip/falcon_unzip/dedup_h_tigs.py          |   70 -
 FALCON_unzip/falcon_unzip/get_read_hctg_map.py     |  110 --
 FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py      |  678 ---------
 .../falcon_unzip/ovlp_filter_with_phase.py         |  354 -----
 FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py  | 1545 --------------------
 FALCON_unzip/falcon_unzip/phasing.py               |  570 --------
 FALCON_unzip/falcon_unzip/phasing_readmap.py       |   73 -
 FALCON_unzip/falcon_unzip/rr_hctg_track.py         |  201 ---
 FALCON_unzip/falcon_unzip/run_quiver.py            |  394 -----
 FALCON_unzip/falcon_unzip/select_reads_from_bam.py |  113 --
 FALCON_unzip/falcon_unzip/unzip.py                 |  357 -----
 FALCON_unzip/setup.py                              |   24 -
 FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py     |    4 -
 .../src/py_scripts/fc_get_read_hctg_map.py         |    4 -
 FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py |    4 -
 .../src/py_scripts/fc_ovlp_filter_with_phase.py    |    6 -
 .../src/py_scripts/fc_phased_ovlp_to_graph.py      |    5 -
 FALCON_unzip/src/py_scripts/fc_phasing.py          |    5 -
 FALCON_unzip/src/py_scripts/fc_phasing_readmap.py  |    4 -
 FALCON_unzip/src/py_scripts/fc_quiver.py           |    4 -
 FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py    |    4 -
 .../src/py_scripts/fc_select_reads_from_bam.py     |    4 -
 .../src/py_scripts/fc_track_reads_htigs0.py        |  398 -----
 FALCON_unzip/src/py_scripts/fc_unzip.py            |    4 -
 FALCON_unzip/src/py_utils/align.sh                 |   10 -
 FALCON_unzip/src/py_utils/call_SV_from_mum.py      |   85 --
 FALCON_unzip/src/py_utils/fetch_read.py            |   12 -
 FALCON_unzip/src/py_utils/split_bam.py             |   28 -
 pypeFLOW/pwatcher/fs_based.py                      |   30 +-
 pypeFLOW/pypeflow/do_task.py                       |    4 +
 pypeFLOW/pypeflow/simple_pwatcher_bridge.py        |   59 +-
 44 files changed, 228 insertions(+), 5233 deletions(-)

diff --git a/FALCON/falcon_kit/bash.py b/FALCON/falcon_kit/bash.py
index 1f64a96..e0880e8 100644
--- a/FALCON/falcon_kit/bash.py
+++ b/FALCON/falcon_kit/bash.py
@@ -276,17 +276,43 @@ def scripts_merge(config, db_prefix, run_jobs_fn):
     """
     with open(run_jobs_fn) as f:
         mjob_data = functional.get_mjob_data(f)
+    bash_funcs = """
+# Get the real path of a link, following all links (since 'readlink -f' is not on OSX).
+# http://stackoverflow.com/questions/7665/how-to-resolve-symbolic-links-in-a-shell-script/697552
+realpath() {
+    local r=$1; local t=$(readlink $r)
+    while [ $t ]; do
+        r=$(cd $(dirname $r) && cd $(dirname $t) && pwd -P)/$(basename $t)
+        t=$(readlink $r)
+    done
+    echo $r
+}
+rmfollow() {
+    local path=$(realpath $1)
+    rm -f "${path}"
+}
+"""
     #las_fns = functional.get_las_filenames(mjob_data, db_prefix) # ok, but tricky
     for p_id in mjob_data:
+        #las_fn = las_fns[p_id]
+        # We already know the output .las filename by convention.
+        las_fn = '%s.%s.las' % (db_prefix, p_id)
+
         bash_lines = mjob_data[p_id]
 
         script = []
         for line in bash_lines:
-            script.append(line.replace('&&', ';'))
-        #las_fn = las_fns[p_id]
-        # We already know the output .las filename by convention.
-        las_fn = '%s.%s.las' % (db_prefix, p_id)
-        yield p_id, '\n'.join(script + ['']), las_fn
+            script.append(line) # Assume we no longer have "&& rm".
+        for line in bash_lines:
+            if not line.startswith('LAmerge'):
+                continue
+            las_files = [word + '.las' for word in functional.yield_args_from_line(line)]
+            assert las_fn == os.path.basename(las_files[0])
+            script.extend('rmfollow {}'.format(fn) for fn in las_files[1:])
+            break
+
+        content = bash_funcs + '\n'.join(script + [''])
+        yield p_id, content, las_fn
 
 def script_run_consensus(config, db_fn, las_fn, out_file_bfn):
     """config: falcon_sense_option, length_cutoff
diff --git a/FALCON/falcon_kit/functional.py b/FALCON/falcon_kit/functional.py
index a799b4d..b9495d5 100644
--- a/FALCON/falcon_kit/functional.py
+++ b/FALCON/falcon_kit/functional.py
@@ -33,7 +33,10 @@ def get_daligner_job_descriptions_sans_LAcheck(run_jobs_stream, db_prefix, singl
     descs = get_daligner_job_descriptions(run_jobs_stream, db_prefix, single)
     result = {}
     for k,v in descs.iteritems():
-        result[k] = skip_LAcheck(v)
+        bash = skip_LAcheck(v)
+        bash = bash.replace('LAsort', 'python -m falcon_kit.mains.LAsort {}'.format(db_prefix))
+        bash = bash.replace('LAmerge', 'python -m falcon_kit.mains.LAmerge {}'.format(db_prefix))
+        result[k] = bash
     return result
 
 def get_daligner_job_descriptions(run_jobs_stream, db_prefix, single=False):
@@ -208,6 +211,15 @@ def get_mjob_data(run_jobs_stream):
             mjob_data[p_id].append(l)
     return mjob_data
 
+def yield_args_from_line(bash_line):
+    """Given a line of LAmerge, etc.,
+    yield output_las_fn, input_las_fn0, input_las_fn1, ... in turn.
+    """
+    for word in bash_line.split():
+        if word.startswith('-') or word in ('LAcheck', 'LAmerge', 'LAsort'):
+            continue
+        yield word
+
 _re_sub_daligner = re.compile(r'^daligner\b', re.MULTILINE)
 def xform_script_for_preads(script):
     daligner_exe = 'daligner_p'
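
For illustration, yield_args_from_line skips flags and the known tool names,
yielding the output las first and then the inputs (values taken from the new
unit test in FALCON/test/test_functional.py below):

    from falcon_kit import functional

    line = 'LAmerge -v preads.1 preads.1.preads.1.C0.S preads.1.preads.1.N0.S'
    assert list(functional.yield_args_from_line(line)) == [
        'preads.1', 'preads.1.preads.1.C0.S', 'preads.1.preads.1.N0.S']
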
diff --git a/FALCON/falcon_kit/mains/LAmerge.py b/FALCON/falcon_kit/mains/LAmerge.py
new file mode 100644
index 0000000..762ebc0
--- /dev/null
+++ b/FALCON/falcon_kit/mains/LAmerge.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+"""Usage:
+
+    LAmerge.py DB <args>
+
+Run LAcheck on each input in args. Exclude any failures from
+the arglist. Then run LAmerge on the remaining arglist.
+
+This differs from LAsort.py in that the first las file is actually
+an *explicit* output, whereas LAsort relies on *implicit* outputs.
+"""
+import sys, os
+
+def log(msg):
+    sys.stderr.write(msg + '\n')
+
+def system(call, checked=False):
+    log('!{}'.format(call))
+    rc = os.system(call)
+    if rc:
+        msg = '{} <- {!r}'.format(rc, call)
+        if checked:
+            raise Exception(msg)
+        log(msg)
+    return rc
+
+def main(argv=sys.argv):
+    db = argv[1]
+    args = argv[2:] # Skip program name and db
+    lass = list()
+    new_args = list()
+    new_args.append('LAmerge')
+    for arg in args:
+        if arg.startswith('-'):
+            new_args.append(arg)
+        else:
+            lass.append(arg)
+    outlas = lass[0]
+    new_args.append(outlas) # This is the output las.
+    for las in lass[1:]:
+        rc = system('LAcheck -vS {} {}.las'.format(db, las)) # Assume sorted.
+        if rc:
+            log('Skipping {}.las'.format(las))
+        else:
+            new_args.append(las)
+    system(' '.join(new_args))
+    system('LAcheck -vS {} {}.las'.format(db, outlas)) # Assume sorted.
+
+
+if __name__ == "__main__":
+    main()
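
A sketch of invoking the new wrapper directly (filenames borrowed from the
unit test; the 'preads' DB prefix is illustrative):

    from falcon_kit.mains import LAmerge

    # Runs 'LAcheck -vS preads <input>.las' on each input las, drops any
    # that fail, then merges the survivors into the explicit output
    # preads.1.las and LAchecks the result.
    LAmerge.main(['LAmerge.py', 'preads', '-v', 'preads.1',
                  'preads.1.preads.1.C0.S', 'preads.1.preads.1.N0.S'])
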
diff --git a/FALCON/falcon_kit/mains/LAsort.py b/FALCON/falcon_kit/mains/LAsort.py
new file mode 100644
index 0000000..387bb47
--- /dev/null
+++ b/FALCON/falcon_kit/mains/LAsort.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+"""Usage:
+
+    LAsort.py DB <args>
+
+Run LAcheck on each input in args. Exclude any failures from
+the arglist. Then run LAsort on the remaining arglist.
+"""
+import sys, os
+
+def log(msg):
+    sys.stderr.write(msg + '\n')
+
+def system(call, checked=False):
+    log('!{}'.format(call))
+    rc = os.system(call)
+    if rc:
+        msg = '{} <- {!r}'.format(rc, call)
+        if checked:
+            raise Exception(msg)
+        log(msg)
+    return rc
+
+def main(argv=sys.argv):
+    log('argv:{!r}'.format(argv))
+    db = argv[1]
+    args = argv[2:] # Skip program name and db
+    lass = list()
+    new_args = list()
+    new_args.append('LAsort')
+    for arg in args:
+        if arg.startswith('-'):
+            new_args.append(arg)
+        else:
+            lass.append(arg)
+    for las in lass:
+        rc = system('LAcheck -v {} {}.las'.format(db, las))
+        if rc:
+            log('Skipping {}.las'.format(las))
+        else:
+            new_args.append(las)
+    system(' '.join(new_args))
+
+
+if __name__ == "__main__":
+    main()
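
The LAsort counterpart, sketched the same way (filename illustrative). There
is no explicit output argument here; judging by the .S names used elsewhere
in this commit, LAsort writes a sorted <input>.S.las beside each input:

    from falcon_kit.mains import LAsort

    # Runs 'LAcheck -v preads preads.1.preads.1.C0.las' first, then
    # 'LAsort -v preads.1.preads.1.C0' on the inputs that passed.
    LAsort.main(['LAsort.py', 'preads', '-v', 'preads.1.preads.1.C0'])
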
diff --git a/FALCON/falcon_kit/mains/run1.py b/FALCON/falcon_kit/mains/run1.py
index 2e5f99d..9b7bee6 100644
--- a/FALCON/falcon_kit/mains/run1.py
+++ b/FALCON/falcon_kit/mains/run1.py
@@ -121,12 +121,15 @@ def main1(prog_name, input_config_fn, logger_config_fn=None):
         fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
         raise
     input_fofn_plf = makePypeLocalFile(config['input_fofn'])
+    genome_size = config.get('genome_size')
+    squash = 0 < genome_size < 1000000  # only squash small (< 1 Mbp) genomes
     wf = PypeProcWatcherWorkflow(job_type=config['job_type'],
             job_queue=config['job_queue'],
             sge_option=config.get('sge_option', ''),
             watcher_type=config['pwatcher_type'],
             watcher_directory=config['pwatcher_directory'],
             use_tmpdir=config.get('use_tmpdir'),
+            squash=squash
     )
     run(wf, config,
             os.path.abspath(input_config_fn),
@@ -259,6 +262,9 @@ def run(wf, config,
             sys.exit(0)
 
         # Produce new FOFN of preads fasta, based on consensus of overlaps.
+        concurrent_jobs = config['cns_concurrent_jobs']
+        wf.max_jobs = concurrent_jobs
+
         scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json')
         make_task = PypeTask(
                 inputs = {
@@ -298,8 +304,6 @@ def run(wf, config,
         task = make_task(pype_tasks.task_report_pre_assembly)
         wf.addTask(task)
 
-        concurrent_jobs = config['cns_concurrent_jobs']
-        wf.max_jobs = concurrent_jobs
         wf.refreshTargets(exitOnFailure=exitOnFailure)
 
 
@@ -341,6 +345,9 @@ def run(wf, config,
 
     preads_nblock = support.get_nblock(fn(preads_db))
     #### run daligner
+    concurrent_jobs = config['ovlp_concurrent_jobs']
+    wf.max_jobs = concurrent_jobs
+
     config['sge_option_da'] = config['sge_option_pda']
 
     scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json')
@@ -404,9 +411,6 @@ def run(wf, config,
     task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
     wf.addTask(task)
 
-    concurrent_jobs = config['ovlp_concurrent_jobs']
-    wf.max_jobs = concurrent_jobs
-
     wf.refreshTargets(exitOnFailure=exitOnFailure)
 
 
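The run1.py hunks above move each phase's wf.max_jobs assignment ahead of the
task construction for that phase, so the concurrency cap is already in force
when the phase's tasks are submitted. Schematically (simplified from the code
above):

    wf.max_jobs = config['cns_concurrent_jobs']         # set the cap first
    task = make_task(pype_tasks.task_report_pre_assembly)
    wf.addTask(task)                                    # then add the tasks
    wf.refreshTargets(exitOnFailure=exitOnFailure)      # run under the cap
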
diff --git a/FALCON/falcon_kit/pype_tasks.py b/FALCON/falcon_kit/pype_tasks.py
index 169bf5e..4219388 100644
--- a/FALCON/falcon_kit/pype_tasks.py
+++ b/FALCON/falcon_kit/pype_tasks.py
@@ -195,7 +195,11 @@ def task_run_las_merge(self):
     gathered_dict = read_gathered_las(gathered_las_fn)
     las_paths = gathered_dict[job_id]
     for las_path in las_paths:
-        src = os.path.relpath(las_path, cwd)
+        assert os.path.isabs(las_path)
+        if os.path.commonprefix([las_path, cwd]) == '/':
+            src = las_path
+        else:
+            src = os.path.relpath(las_path, cwd)
         tgt = os.path.join(cwd, os.path.basename(las_path))
         LOG.debug('symlink {!r} -> {!r}'.format(src, tgt))
         os.symlink(src, tgt)
@@ -252,7 +256,7 @@ def task_daligner_scatter(self):
     LOG.info('Skip LAcheck after daligner? {}'.format(skip_checks))
     func = task_run_daligner
     func_name = '{}.{}'.format(func.__module__, func.__name__)
-    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, db_build_done, nblock, pread_aln, skip_checks):
+    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, db_build_done, nblock, pread_aln, skip_check=skip_checks):
         job_done_fn = 'job_%s_done' %job_uid
         parameters =  {'daligner_script': script,
                        'job_uid': job_uid,
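
The commonprefix test above guards against symlinking across unrelated trees:
os.path.commonprefix is a plain character-wise prefix, so two absolute paths
that share nothing but the root yield '/', in which case a relative source
would be a fragile ../.. chain. An illustration with hypothetical paths:

    import os.path

    las_path = '/scratch/mypwatcher/preads.1.las'    # hypothetical
    cwd = '/home/user/run/1-preads_ovl/m_00001'      # hypothetical
    print(os.path.commonprefix([las_path, cwd]))     # -> '/', keep absolute
    print(os.path.relpath(las_path, cwd))            # -> ../../../../../scratch/...
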
diff --git a/FALCON/test/test_functional.py b/FALCON/test/test_functional.py
index aafb6d4..bbcacaf 100644
--- a/FALCON/test/test_functional.py
+++ b/FALCON/test/test_functional.py
@@ -200,3 +200,11 @@ sample_perl_counts_output = """
 def test_calc_metric_fragmentation():
     frag = f.calc_metric_fragmentation(sample_perl_counts_output)
     eq_(frag, 2.5)
+
+def test_args_from_line():
+    line = 'LAmerge -v preads.1 preads.1.preads.1.C0.S preads.1.preads.1.N0.S'
+    expected = ['preads.1', 'preads.1.preads.1.C0.S', 'preads.1.preads.1.N0.S']
+    result = list(f.yield_args_from_line(line))
+    helpers.equal_list(result, expected)
+    bash_lines = [line]
+    las_files = [word + '.las' for line in bash_lines if line.startswith('LAmerge') for word in f.yield_args_from_line(line)][1:]
diff --git a/FALCON_unzip/LICENSE.txt b/FALCON_unzip/LICENSE.txt
deleted file mode 100644
index 191f795..0000000
--- a/FALCON_unzip/LICENSE.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-The Clear BSD License
-
-Copyright (c) 2016, Pacific Biosciences
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted (subject to the limitations in the disclaimer
-below) provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its contributors may be used
-  to endorse or promote products derived from this software without specific
-  prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS
-LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
-GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGE.
diff --git a/FALCON_unzip/README.md b/FALCON_unzip/README.md
deleted file mode 100644
index fd5191c..0000000
--- a/FALCON_unzip/README.md
+++ /dev/null
@@ -1 +0,0 @@
-Please refer to https://github.com/PacificBiosciences/FALCON_unzip/wiki
diff --git a/FALCON_unzip/examples/fc_unzip.cfg b/FALCON_unzip/examples/fc_unzip.cfg
deleted file mode 100644
index 787360c..0000000
--- a/FALCON_unzip/examples/fc_unzip.cfg
+++ /dev/null
@@ -1,17 +0,0 @@
-[General]
-job_type = SGE
-
-[Unzip]
-input_fofn= input.fofn
-input_bam_fofn= input_bam.fofn
-
-smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
-
-jobqueue = your_sge_queue
-sge_phasing= -pe smp 12 -q %(jobqueue)s
-sge_quiver= -pe smp 12 -q %(jobqueue)s
-sge_track_reads= -pe smp 12 -q %(jobqueue)s
-sge_blasr_aln=  -pe smp 24 -q %(jobqueue)s
-sge_hasm=  -pe smp 48 -q %(jobqueue)s
-unzip_concurrent_jobs = 64
-quiver_concurrent_jobs = 64
diff --git a/FALCON_unzip/examples/unzip.sh b/FALCON_unzip/examples/unzip.sh
deleted file mode 100644
index 529e368..0000000
--- a/FALCON_unzip/examples/unzip.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-fc_unzip.py fc_unzip.cfg
-fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/examples/unzip_obsoleted.sh b/FALCON_unzip/examples/unzip_obsoleted.sh
deleted file mode 100644
index 88d22da..0000000
--- a/FALCON_unzip/examples/unzip_obsoleted.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# assign raw reads to each contig
-fc_track_reads.py
-# get the reads assigned to each contig from the initial *.subreads.fasta files
-mkdir -p 3-unzip/reads/
-fc_fetch_reads.py
-# phase the reads
-fc_unzip.py fc_unzip.cfg
-# do the haplotig assembly; this part will be integrated into fc_unzip.py
-mkdir -p 3-unzip/1-hasm/
-cd 3-unzip/1-hasm/
-find ../0-phasing/ -name "rid_to_phase.*" | xargs cat  > rid_to_phase.all
-fc_ovlp_filter_with_phase.py --fofn ../../2-asm-falcon/las.fofn --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map ./rid_to_phase.all > preads.p_ovl
-fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
-fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map ./rid_to_phase.all --fasta ../../1-preads_ovl/preads4falcon.fasta
-WD=$PWD
-for f in `cat ../reads/ctg_list `;do cd $WD/$f; fc_dedup_h_tigs.py $f; done
-# prepare for quivering the haplotigs
-cd $WD/..
-rm all_phased_reads all_h_ctg_ids all_h_ctg_edges all_p_ctg_edges all_p_ctg.fa all_h_ctg.fa
-find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
-find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
-find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
-find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
-find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
-find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
-cd ../
-# assign raw reads to each primary contig or haplotig
-fc_track_reads_htigs.py
-# decompose PacBio raw reads (with pulse information, in bam files) into an individual bam file for each haplotig or primary contig
-mkdir -p 4-quiver/reads/
-fc_select_reads_from_bam.py input_bam.fofn
-# run blasr and quiver to generate consensus for each haplotig or primary contig
-fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/falcon_unzip/__init__.py b/FALCON_unzip/falcon_unzip/__init__.py
deleted file mode 100644
index 94e4fd5..0000000
--- a/FALCON_unzip/falcon_unzip/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#################################################################################$$
-# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
diff --git a/FALCON_unzip/falcon_unzip/dedup_h_tigs.py b/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
deleted file mode 100644
index 61bd8c3..0000000
--- a/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import os
-import sys
-from falcon_kit.FastaReader import FastaReader
-
-def main(args):
-
-    ctg_id = sys.argv[1]
-
-    if os.path.exists("h_ctg_all.{ctg_id}.fa".format(ctg_id = ctg_id)):
-        os.system("nucmer -mum p_ctg.{ctg_id}.fa h_ctg_all.{ctg_id}.fa -p hp_aln".format(ctg_id = ctg_id))
-        os.system("show-coords -T -H -l -c hp_aln.delta > hp_aln.coor")
-    else:
-        sys.exit(0) # it is OK if there is no h_ctg_all.{ctg_id}.fa; we don't want to interrupt the workflow
-
-
-    if os.path.exists("hp_aln.coor"):
-        filter_out = set()
-        with open("hp_aln.coor") as f:
-            for row in f:
-                row = row.strip().split()
-                q_cov = float(row[10])
-                idt = float(row[6])
-                if q_cov > 99 and idt > 99:
-                    filter_out.add(row[-1])
-
-        p_ctg_to_phase = {}
-        with open("p_ctg_path.%s" % ctg_id) as f:
-            for row in f:
-                row = row.strip().split()
-                b_id, ph_id = (int(row[-2]), int(row[-1]) )
-                p_ctg_to_phase.setdefault( row[0], {} )
-                p_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
-                p_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
-
-
-        h_ctg_to_phase = {}
-        with open("h_ctg_path.%s" % ctg_id) as f:
-            for row in f:
-                row = row.strip().split()
-                b_id, ph_id = (int(row[-2]), int(row[-1]) )
-                h_ctg_to_phase.setdefault( row[0], {} )
-                h_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
-                h_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
-
-        h_ids = open("h_ctg_ids.%s" % ctg_id,"w")
-        with open("h_ctg.%s.fa" % ctg_id, "w") as f:
-            h_tig_all = FastaReader("h_ctg_all.%s.fa" % ctg_id)
-            for r in h_tig_all:
-                p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {})
-
-                if len(r.sequence) < 500:
-                    continue
-
-                if r.name in filter_out:
-                    edge_count = sum(h_ctg_to_phase[ r.name ].values())
-                    same_phase_to_p_ctg_count = 0
-                    for  b_id, ph_id in h_ctg_to_phase[ r.name ]:
-                        if b_id != -1:
-                            if (b_id, ph_id) in p_ctg_phase:
-                                same_phase_to_p_ctg_count += h_ctg_to_phase[ r.name ][ (b_id, ph_id) ]
-                    unphased_edge_count = h_ctg_to_phase[ r.name ].get( (-1, 0), 0 )
-
-                    print r.name, edge_count, unphased_edge_count, same_phase_to_p_ctg_count
-                    if edge_count - unphased_edge_count - same_phase_to_p_ctg_count < 5: # if there are many non-p_ctg-phase segments, do not filter out
-                        continue
-
-                print >>f, ">"+r.name
-                print >>f, r.sequence
-                print >> h_ids, r.name
-        h_ids.close()
diff --git a/FALCON_unzip/falcon_unzip/get_read_hctg_map.py b/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
deleted file mode 100644
index 18909f2..0000000
--- a/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
-        makePypeLocalFile, fn, PypeTask)
-import argparse
-import logging
-import os
-import sys
-
-def make_dirs(d):
-    if not os.path.isdir(d):
-        os.makedirs(d)
-
-def generate_read_to_hctg_map(self):
-    rawread_id_file = fn( self.rawread_id_file )
-    pread_id_file = fn( self.pread_id_file )
-    read_to_contig_map = fn( self.read_to_contig_map )
-
-    pread_did_to_rid = open(pread_id_file).read().split('\n')
-    rid_to_oid = open(rawread_id_file).read().split('\n')
-
-    h_ctg_edges = fn( self.h_ctg_edges )
-    p_ctg_edges = fn( self.p_ctg_edges )
-
-    h_ctg_ids = set()
-    with open(fn(self.h_ctg_ids)) as f:
-        for row in f:
-            row = row.strip()
-            h_ctg_ids.add( row )
-
-    pread_to_contigs = {}
-
-    for fname in ( p_ctg_edges, h_ctg_edges):
-        with open(fname) as f:
-            for row in f:
-                row = row.strip().split()
-                ctg = row[0]
-                if len(ctg.split('_')) > 1 and ctg not in h_ctg_ids:
-                    continue
-                n1 = row[1]
-                n2 = row[2]
-                pid1 = int(n1.split(':')[0])
-                pid2 = int(n2.split(':')[0])
-                rid1 = pread_did_to_rid[pid1].split('/')[1]
-                rid2 = pread_did_to_rid[pid2].split('/')[1]
-                rid1 = int(int(rid1)/10)
-                rid2 = int(int(rid2)/10)
-                oid1 = rid_to_oid[rid1]
-                oid2 = rid_to_oid[rid2]
-                k1 = (pid1, rid1, oid1)
-                pread_to_contigs.setdefault( k1, set() )
-                pread_to_contigs[ k1 ].add( ctg )
-                k2 = (pid2, rid2, oid2)
-                pread_to_contigs.setdefault( k2, set() )
-                pread_to_contigs[ k2 ].add( ctg )
-
-    with open(read_to_contig_map, 'w') as f:
-        for k in pread_to_contigs:
-            pid, rid, oid = k
-            for ctg in list(pread_to_contigs[ k ]):
-                print >>f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
-
-def get_read_hctg_map(asm_dir, hasm_dir, quiver_dir):
-    wf = PypeProcWatcherWorkflow(
-            max_jobs=12, # TODO: Why was NumThreads ever set? There is only one task!
-    )
-
-    rawread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
-    pread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
-    h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
-    p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
-    h_ctg_ids = makePypeLocalFile( os.path.join(hasm_dir, "all_h_ctg_ids"))
-    read_map_dir = os.path.join(quiver_dir, 'read_maps')
-    make_dirs(read_map_dir)
-
-    read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, 'read_to_contig_map'))
-
-    inputs = { 'rawread_id_file': rawread_id_file,
-               'pread_id_file': pread_id_file,
-               'h_ctg_edges': h_ctg_edges,
-               'p_ctg_edges': p_ctg_edges,
-               'h_ctg_ids': h_ctg_ids}
-
-    make_task = PypeTask(
-               inputs = inputs,
-               outputs = {'read_to_contig_map': read_to_contig_map},
-    )
-    wf.addTask(make_task(generate_read_to_hctg_map))
-    wf.refreshTargets() # block
-
-
-
-def parse_args(argv):
-    parser = argparse.ArgumentParser(description='generate `4-quiver/read_maps/read_to_contig_map` that contains the \
-information from the chain of mapping: (contig id, last col) -> (internal p-read id) -> (internal raw-read id) -> (original read id)\n \
-It assumes the 2-asm-falcon/read_maps ... /dump_rawread_ids/rawread_ids and /dump_pread_ids/pread_ids are already generated',
-              formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--basedir', type=str, default="./", help='the base working dir of a FALCON assembly')
-    args = parser.parse_args(argv[1:])
-    return args
-
-def main(argv=sys.argv):
-    logging.basicConfig()
-    args = parse_args(argv)
-    basedir = args.basedir
-    #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
-    #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
-    asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
-    hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
-    quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
-
-    get_read_hctg_map(asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
diff --git a/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py b/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
deleted file mode 100644
index fe7fd00..0000000
--- a/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
+++ /dev/null
@@ -1,678 +0,0 @@
-from falcon_kit.fc_asm_graph import AsmGraph
-from falcon_kit.FastaReader import FastaReader
-import os
-import networkx as nx
-from multiprocessing import Pool
-import argparse
-import re
-import sys
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-## for shared memory usage
-global p_asm_G
-global h_asm_G
-global all_rid_to_phase
-global seqs
-
-def mkdir(d):
-    if not os.path.isdir(d):
-        os.makedirs(d)
-
-def reverse_end( node_id ):
-    node_id, end = node_id.split(":")
-    new_end = "B" if end == "E" else "E"
-    return node_id + ":" + new_end
-
-
-def load_sg_seq(all_read_ids, fasta_fn):
-
-    seqs = {}
-    # load all p-read names into memory
-    f = FastaReader(fasta_fn)
-    for r in f:
-        if r.name not in all_read_ids:
-            continue
-        seqs[r.name] = r.sequence.upper()
-    return seqs
-
-def generate_haplotigs_for_ctg(input_):
-
-    ctg_id, out_dir = input_
-    global p_asm_G
-    global h_asm_G
-    global all_rid_to_phase
-    global seqs
-    arid_to_phase = all_rid_to_phase[ctg_id]
-
-    mkdir( out_dir )
-
-    ctg_G = p_asm_G.get_sg_for_ctg(ctg_id)
-
-    ctg_nodes = set(ctg_G.nodes())
-
-    sg = nx.DiGraph()
-
-    for v, w in ctg_G.edges():
-
-        vrid = v[:9]
-        wrid = w[:9]
-
-        edge_data = p_asm_G.sg_edges[ (v, w) ]
-        if edge_data[-1] != "G":
-            continue
-
-        vphase = arid_to_phase.get(vrid, (-1,0))
-        wphase = arid_to_phase.get(wrid, (-1,0))
-        if vphase[0] == wphase[0] and vphase[1] != wphase[1]:
-            cross_phase = "Y"
-        else:
-            cross_phase = "N"
-
-        sg.add_node( v, label= "%d_%d" % vphase,
-                        phase="%d_%d" % vphase,
-                        src="P" )
-
-        sg.add_node( w, label= "%d_%d" % wphase,
-                        phase="%d_%d" % wphase,
-                        src="P" )
-
-        sg.add_edge(v, w, src="OP", cross_phase = cross_phase)
-
-        # we need to add the complementary edges, as the ctg_graph does not contain the dual edges
-        rv = reverse_end(v)
-        rw = reverse_end(w)
-        sg.add_node( rv, label= "%d_%d" % vphase,
-                         phase="%d_%d" % vphase,
-                         src="P" )
-
-        sg.add_node( rw, label= "%d_%d" % wphase,
-                         phase="%d_%d" % wphase,
-                         src="P" )
-
-        sg.add_edge(rw, rv, src="OP", cross_phase = cross_phase)
-
-    PG_nodes = set(sg.nodes())
-    PG_edges = set(sg.edges())
-
-    for v, w in h_asm_G.sg_edges:
-
-        vrid = v[:9]
-        wrid = w[:9]
-
-        if vrid not in arid_to_phase:
-            continue
-        if wrid not in arid_to_phase:
-            continue
-
-        if (v, w) in PG_edges:
-            if p_asm_G.sg_edges[(v,w)][-1] == "G":
-                continue
-
-        edge_data = h_asm_G.sg_edges[ (v, w) ]
-
-        if edge_data[-1] != "G":
-            continue
-
-        cross_phase = "N"
-        if v not in PG_nodes:
-            sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
-                            phase="%d_%d" % arid_to_phase[vrid],
-                            src="H" )
-
-        if w not in PG_nodes:
-            sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
-                            phase="%d_%d" % arid_to_phase[wrid],
-                            src="H" )
-
-        sg.add_edge(v, w, src="H", cross_phase = cross_phase)
-
-        rv = reverse_end(v)
-        rw = reverse_end(w)
-        if rv not in PG_nodes:
-            sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
-                             phase="%d_%d" % arid_to_phase[vrid],
-                             src="H" )
-
-        if rw not in PG_nodes:
-            sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
-                             phase="%d_%d" % arid_to_phase[wrid],
-                             src="H" )
-
-        sg.add_edge(rw, rv, src="H", cross_phase = cross_phase)
-
-    sg0 = sg.copy()
-    for v, w in h_asm_G.sg_edges:
-        vrid = v[:9]
-        wrid = w[:9]
-        if vrid not in arid_to_phase:
-            continue
-        if wrid not in arid_to_phase:
-            continue
-
-        if (v, w) in PG_edges:
-            if p_asm_G.sg_edges[(v,w)][-1] == "G":
-                continue
-
-        edge_data = h_asm_G.sg_edges[ (v, w) ]
-
-        if sg0.in_degree(w) == 0:
-            cross_phase = "Y"
-            if v not in PG_nodes:
-                sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
-                                phase="%d_%d" % arid_to_phase[vrid],
-                                src="H" )
-
-            if w not in PG_nodes:
-                sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
-                                phase="%d_%d" % arid_to_phase[wrid],
-                                src="H" )
-
-            sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
-
-            rv = reverse_end(v)
-            rw = reverse_end(w)
-            if rv not in PG_nodes:
-                sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
-                                 phase="%d_%d" % arid_to_phase[vrid],
-                                 src="H" )
-
-            if rw not in PG_nodes:
-                sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
-                                 phase="%d_%d" % arid_to_phase[wrid],
-                                 src="H" )
-
-            sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
-
-        if sg0.out_degree(v) == 0:
-            cross_phase = "Y"
-            if v not in PG_nodes:
-                sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
-                                phase="%d_%d" % arid_to_phase[vrid],
-                                src="H" )
-
-            if w not in PG_nodes:
-                sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
-                                phase="%d_%d" % arid_to_phase[wrid],
-                                src="H" )
-
-            sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
-
-            rv = reverse_end(v)
-            rw = reverse_end(w)
-            if rv not in PG_nodes:
-                sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
-                                 phase="%d_%d" % arid_to_phase[vrid],
-                                 src="H" )
-
-            if rw not in PG_nodes:
-                sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
-                                 phase="%d_%d" % arid_to_phase[wrid],
-                                 src="H" )
-
-            sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
-
-    sg2 = sg.copy()
-    ctg_nodes_r = set([ reverse_end(v) for v in list(ctg_nodes) ])
-    for v, w in ctg_G.edges():
-        sg2.remove_edge(v, w)
-        rv, rw = reverse_end(v), reverse_end(w)
-        sg2.remove_edge(rw, rv)
-    for v in sg2.nodes():
-        if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
-            sg2.remove_node(v)
-
-    nodes_to_remove = set()
-    edges_to_remove = set()
-    for sub_g in nx.weakly_connected_component_subgraphs(sg2):
-        sub_g_nodes = set(sub_g.nodes())
-        if len(sub_g_nodes & ctg_nodes_r) > 0 and len(sub_g_nodes & ctg_nodes) > 0:
-            # remove cross edge
-            sources = [n for n in sub_g.nodes() if sub_g.in_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
-            sinks = [n for n in sub_g.nodes() if sub_g.out_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
-            edges_to_keep = set()
-            for v in sources:
-                for w in sinks:
-                    path = []
-                    if v in ctg_nodes and w not in ctg_nodes_r:
-                        try:
-                            path = nx.shortest_path( sub_g, v, w )
-                        except nx.exception.NetworkXNoPath:
-                            path = []
-                    elif v not in ctg_nodes and w in ctg_nodes_r:
-                        try:
-                            path = nx.shortest_path( sub_g, v, w )
-                        except nx.exception.NetworkXNoPath:
-                            path = []
-
-                    if len(path) >= 2:
-                        v1 = path[0]
-                        for w1 in path[1:]:
-                            edges_to_keep.add( (v1, w1) )
-                            rv1, rw1 = reverse_end(v1), reverse_end(w1)
-                            edges_to_keep.add( (rw1, rv1) )
-                            v1 = w1
-            for v, w in sub_g.edges():
-                if (v, w) not in edges_to_keep:
-                    edges_to_remove.add( (v, w) )
-                    rv, rw = reverse_end(v), reverse_end(w)
-                    edges_to_remove.add( (rw, rv) )
-
-
-        if len(sub_g_nodes & ctg_nodes_r) == 0 and len(sub_g_nodes & ctg_nodes) == 0:
-            nodes_to_remove.update( sub_g_nodes )
-            nodes_to_remove.update( set( [reverse_end(v) for v in list(sub_g_nodes)] ) )
-
-    for v, w in list(edges_to_remove):
-        sg.remove_edge(v, w)
-
-    for v in nodes_to_remove:
-        sg.remove_node(v)
-
-    for v in sg.nodes():
-        if sg.out_degree(v) == 0 and sg.in_degree(v) == 0:
-            sg.remove_node(v)
-
-    #nx.write_gexf(sg, "full_g.gexf")
-
-    s_node = p_asm_G.ctg_data[ctg_id][5][0][0]
-    t_node = p_asm_G.ctg_data[ctg_id][5][-1][-1]
-
-    for v, w in sg.edges():
-        phase0 = sg.node[v]["phase"].split("_")
-        phase1 = sg.node[w]["phase"].split("_")
-        if phase0 == phase1:
-            sg[v][w]["weight"] = 10
-            sg[v][w]["score"] = 1
-            sg[v][w]["label"] = "type0"
-        else:
-            if phase0[0] == phase1[0]:
-                sg[v][w]["weight"] = 1
-                sg[v][w]["score"] = 100000
-                sg[v][w]["label"] = "type1"
-            else:
-                sg[v][w]["weight"] = 5
-                sg[v][w]["score"] = 50
-                sg[v][w]["label"] = "type2"
-
-
-    sg2 = sg.copy()
-    edge_to_remove = set()
-    for v, w in sg2.edges():
-        if sg2[v][w]["src"] == "ext":
-            edge_to_remove.add( (v, w) )
-            rv, rw = reverse_end(v), reverse_end(w)
-            edge_to_remove.add( (rw, rv) )
-
-        if sg2.node[v]["phase"] ==  sg2.node[w]["phase"]:
-            continue
-        flag1 = 0
-        flag2 = 0
-        for e in sg2.out_edges(v):
-            if sg2.node[e[0]]["phase"] ==  sg2.node[e[1]]["phase"]:
-                flag1 = 1
-                break
-        if flag1 == 1:
-            for e in sg2.in_edges(w):
-                if sg2.node[e[0]]["phase"] ==  sg2.node[e[1]]["phase"]:
-                    flag2 = 1
-                    break
-        if flag2 == 1:
-            edge_to_remove.add( (v, w) )
-            rv, rw = reverse_end(v), reverse_end(w)
-            edge_to_remove.add( (rw, rv) )
-
-    for v, w in list(edge_to_remove):
-        sg2.remove_edge(v, w)
-
-    ## remove some new shortcuts due to repeats in the haplotype edges
-    edge_to_remove = set()
-    with open(os.path.join(out_dir, "path_len.%s" % ctg_id), "w") as f:
-
-        all_length_from_s = nx.shortest_path_length(sg, source=s_node)
-        edge_to_remove = set()
-
-        for w in all_length_from_s:
-            longest = 0
-            if sg.in_degree(w) >= 2:
-                for e in sg.in_edges(w):
-                    v = e[0]
-                    if v in all_length_from_s:
-                        len_ = all_length_from_s[v]
-                        if len_ > longest:
-                            longest = len_
-                if longest == 0:
-                    continue
-                for e in sg.in_edges(w):
-                    v = e[0]
-                    if v in all_length_from_s:
-                        len_ = all_length_from_s[v]
-                        print >>f, ctg_id, "link_lengths", v, w, longest, len_
-                        if longest - len_ > 10 and sg[v][w]["src"] != "OP":  # make sure we always have one path derived from the original graph from s to t
-                            edge_to_remove.add( (v, w) )
-                            rv, rw = reverse_end(v), reverse_end(w)
-                            edge_to_remove.add( (rw, rv) )
-
-        for v, w in list(edge_to_remove):
-            sg.remove_edge(v, w)
-            print >>f, ctg_id, "removed", v, w
-
-        for v, w in list(edge_to_remove):
-            try:
-                sg2.remove_edge(v, w)
-            except Exception:
-                pass
-
-
-    nx.write_gexf(sg, os.path.join(out_dir, "sg.gexf" ))
-    nx.write_gexf(sg2, os.path.join(out_dir, "sg2.gexf" ))
-
-
-    try:
-        s_path = nx.shortest_path(sg2, source=s_node, target=t_node, weight="score")
-    except nx.exception.NetworkXNoPath:
-        s_path = nx.shortest_path(sg, source=s_node, target=t_node, weight="score")
-
-    s_path_edges = []
-    for i in xrange(len(s_path)-1):
-        v = s_path[i]
-        w = s_path[i+1]
-        sg[v][w]["weight"] = 15
-        s_path_edges.append( (v,w) )
-
-    s_path_edge_set = set(s_path_edges)
-
-
-
-    #output the updated primary contig
-    p_tig_path = open(os.path.join(out_dir, "p_ctg_path.%s" % ctg_id),"w")
-    p_tig_fa = open(os.path.join(out_dir, "p_ctg.%s.fa" % ctg_id),"w")
-    edges_to_remove1 = set()
-    edges_to_remove2 = set()
-    with open(os.path.join(out_dir, "p_ctg_edges.%s" % ctg_id), "w") as f:
-        seq = []
-        for v, w in s_path_edges:
-            sg[v][w]["h_edge"] = 1
-            vrid = v.split(":")[0]
-            wrid = w.split(":")[0]
-            vphase = arid_to_phase.get(vrid, (-1,0))
-            wphase = arid_to_phase.get(wrid, (-1,0))
-            print >>f, "%s" % ctg_id, v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
-
-            if sg.edge[v][w]["src"] == "OP":
-                edge_data = p_asm_G.sg_edges[ (v,w) ]
-            else:
-                edge_data = h_asm_G.sg_edges[ (v,w) ]
-
-            seq_id, s, t = edge_data[0]
-            if s < t:
-                seq.append(seqs[ seq_id ][ s:t ])
-            else:
-                seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ s:t:-1 ] ]))
-            print >>p_tig_path, "%s" % ctg_id, v, w, seq_id, s, t, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
-            sg[v][w]["tig_id"] = "%s" % ctg_id
-
-            rv, rw = reverse_end(v), reverse_end(w)
-            edges_to_remove1.add( (v, w) )
-            edges_to_remove2.add( (rw, rv) )
-
-        print >> p_tig_fa, ">%s" % ctg_id
-        print >> p_tig_fa, "".join(seq)
-
-    p_tig_fa.close()
-    p_tig_path.close()
-
-
-
-    sg2 = sg.copy()
-    reachable1 = nx.descendants(sg2, s_node)
-    sg2_r = sg2.reverse()
-    reachable2 = nx.descendants(sg2_r, t_node)
-
-    reachable_all = reachable1 | reachable2
-    reachable_both = reachable1 & reachable2
-
-
-    for v, w in list(edges_to_remove2 | edges_to_remove1):
-        sg2.remove_edge( v, w )
-
-    for v, w in sg2.edges():
-        if sg2[v][w]["cross_phase"] == "Y":
-            sg2.remove_edge( v, w )
-
-    for v in sg2.nodes():
-        if v not in reachable_all:
-            sg2.remove_node(v)
-
-    for v in sg2.nodes():
-        if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
-            sg2.remove_node(v)
-            continue
-        if v in reachable_both:
-            sg2.node[v]["reachable"] = 1
-        else:
-            sg2.node[v]["reachable"] = 0
-
-    dump_graph = False # the code segment below is useful for showing the graph
-    if dump_graph == True:
-        nx.write_gexf(sg2, "%s_1.gexf" % ctg_id)
-
-    p_path_nodes = set(s_path)
-    p_path_rc_nodes = set( [reverse_end(v) for v in s_path] )
-
-    sg2_nodes = set(sg2.nodes())
-    for v in p_asm_G.get_sg_for_ctg(ctg_id).nodes():
-        rv = reverse_end(v)
-        p_path_rc_nodes.add( rv )
-        if rv in sg2_nodes:
-            sg2.remove_node(rv)
-
-
-    h_tig_path = open(os.path.join(out_dir, "h_ctg_path.%s" % ctg_id),"w")
-    h_tig_fa = open(os.path.join(out_dir, "h_ctg_all.%s.fa" % ctg_id),"w")
-    edges_to_remove = set()
-
-    labelled_node = set()
-    with open(os.path.join(out_dir, "h_ctg_edges.%s" % ctg_id),"w") as f:
-        h_tig_id = 1
-        h_paths = {}
-        #print "number of components:", len([tmp for tmp in nx.weakly_connected_component_subgraphs(sg2)])
-        for sub_hg_0 in nx.weakly_connected_component_subgraphs(sg2):
-            sub_hg = sub_hg_0.copy()
-            while sub_hg.size() > 5:
-                #print "sub_hg size:", len(sub_hg.nodes())
-                sources = [n for n in sub_hg.nodes() if sub_hg.in_degree(n) != 1 ]
-                sinks = [n for n in sub_hg.nodes() if sub_hg.out_degree(n) != 1 ]
-
-
-                #print "number of sources", len(sources),  sources
-                #print "number of sinks", len(sinks), sinks
-                if len(sources) == 0 and len(sinks) == 0: # TODO: the rest of the sub-graph consists of cycles; we need to break and print a warning message
-                    break
-
-                longest = []
-
-                eliminated_sinks = set()
-                s_longest = {}
-                for s in sources:
-                    #print "test source",s, len(eliminated_sinks)
-                    if s in labelled_node:
-                        continue
-                    s_path = []
-                    for t in sinks:
-                        if t in eliminated_sinks:
-                            continue
-                        try:
-                            path = nx.shortest_path(sub_hg, s, t, weight="score")
-                            #print "test path len:", len(path), s, t
-                        except nx.exception.NetworkXNoPath:
-                            path = []
-                            continue
-                        s_path.append( [ path, t ] )
-                    s_path.sort(key = lambda x: -len(x[0]))
-                    if len(s_path) == 0:
-                        continue
-                    s_longest[s] = s_path[0][0]
-                    if len(s_longest[s]) > len(longest):
-                        longest = s_longest[s]
-                        #print "s longest", longest[0], longest[-1], len(longest)
-                    for path, t in s_path[1:]:
-                        eliminated_sinks.add(t)
-                        #print "elimated t", t
-
-
-                if len(longest) == 0:
-                    break
-
-                s = longest[0]
-                t = longest[-1]
-                h_paths[ ( s, t ) ] = longest
-
-                labelled_node.add(s)
-                rs = reverse_end(s)
-                labelled_node.add(rs)
-
-                for v in longest:
-                    sub_hg.remove_node(v)
-
-        for s, t in h_paths:
-            longest = h_paths[ (s, t) ]
-            #print "number of node in path", s,t,len(longest)
-            seq = []
-            for v, w in zip(longest[:-1], longest[1:]):
-                sg[v][w]["h_edge"] = 1
-                if sg.edge[v][w]["src"] == "OP":
-                    edge_data = p_asm_G.sg_edges[ (v,w) ]
-                else:
-                    edge_data = h_asm_G.sg_edges[ (v,w) ]
-                vrid = v.split(":")[0]
-                wrid = w.split(":")[0]
-                vphase = arid_to_phase.get(vrid, (-1,0))
-                wphase = arid_to_phase.get(wrid, (-1,0))
-                print >>f, "%s_%03d" % (ctg_id, h_tig_id), v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
-
-                if sg.edge[v][w]["src"] == "OP":
-                    edge_data = p_asm_G.sg_edges[ (v,w) ]
-                else:
-                    edge_data = h_asm_G.sg_edges[ (v,w) ]
-
-                seq_id, sp, tp = edge_data[0]
-                if sp < tp:
-                    seq.append(seqs[ seq_id ][ sp:tp ])
-                else:
-                    seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ sp:tp:-1 ] ]))
-                print >> h_tig_path, "%s_%03d" % (ctg_id, h_tig_id), v, w, seq_id, sp, tp, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
-                sg[v][w]["tig_id"] = "%s_%03d" % (ctg_id, h_tig_id)
-
-                rv, rw = reverse_end(v), reverse_end(w)
-                edges_to_remove.add( (v, w) )
-                edges_to_remove.add( (rw, rv) )
-
-            print >> h_tig_fa, ">%s_%03d" % (ctg_id, h_tig_id)
-            print >> h_tig_fa, "".join(seq)
-            h_tig_id += 1
-
-
-    h_tig_fa.close()
-    h_tig_path.close()
-
-    dump_graph = False  # the code segment below is useful for showing the graph
-    if dump_graph == True:
-        for v, w in sg.edges():
-            if "h_edge" not in sg[v][w]:
-                sg[v][w]["h_edge"] = 0
-            if v in reachable_all:
-                sg.node[v]["reachable"] = 1
-            else:
-                sg.node[v]["reachable"] = 0
-            if w in reachable_all:
-                sg.node[w]["reachable"] = 1
-            else:
-                sg.node[w]["reachable"] = 0
-
-        nx.write_gexf(sg, "%s_0.gexf" % ctg_id)
-
-
-
-def parse_args(argv):
-
-    parser = argparse.ArgumentParser(description='layout haplotigs from primary assembly graph and phased assembly graph')
-
-    parser.add_argument('--fc_asm_path', type=str, help='path to the primary Falcon assembly output directory', required=True)
-    parser.add_argument('--fc_hasm_path', type=str, help='path to the phased Falcon assembly output directory', required=True)
-    parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', default = "all", required=True)
-    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, default to current working directory')
-    parser.add_argument('--rid_phase_map', type=str, help="path to the file that encode the relationship of the read id to phase blocks", required=True)
-    parser.add_argument('--fasta', type=str, help="sequence file of the p-reads", required=True)
-    args = parser.parse_args(argv[1:])
-
-    return args
-
-def main(argv=sys.argv):
-
-    # make life easier for now. will refactor it out if possible
-    global all_rid_to_phase
-    global p_asm_G
-    global h_asm_G
-    global all_rid_to_phase
-    global seqs
-
-    args = parse_args(argv)
-    fc_asm_path = args.fc_asm_path
-    fc_hasm_path = args.fc_hasm_path
-    ctg_id = args.ctg_id
-    base_dir = args.base_dir
-    fasta_fn = args.fasta
-
-    p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"),
-                       os.path.join(fc_asm_path, "utg_data"),
-                       os.path.join(fc_asm_path, "ctg_paths") )
-
-    h_asm_G = AsmGraph( os.path.join(fc_hasm_path, "sg_edges_list"),
-                        os.path.join(fc_hasm_path, "utg_data"),
-                        os.path.join(fc_hasm_path, "ctg_paths") )
-
-
-    all_rid_to_phase = {}
-
-    all_read_ids = set()
-    with open(args.rid_phase_map) as f:
-        for row in f:
-            row = row.strip().split()
-            all_rid_to_phase.setdefault( row[1], {} )
-            all_rid_to_phase[row[1]][row[0]] = (int(row[2]), int(row[3]))
-            all_read_ids.add(row[0])
-
-    for v, w in p_asm_G.sg_edges:
-        if p_asm_G.sg_edges[ (v, w) ][-1] != "G":
-            continue
-        v = v.split(":")[0]
-        w = w.split(":")[0]
-        all_read_ids.add(v)
-        all_read_ids.add(w)
-
-    for v, w in h_asm_G.sg_edges:
-        if h_asm_G.sg_edges[ (v, w) ][-1] != "G":
-            continue
-        v = v.split(":")[0]
-        w = w.split(":")[0]
-        all_read_ids.add(v)
-        all_read_ids.add(w)
-
-    seqs = load_sg_seq(all_read_ids, fasta_fn)
-
-    if ctg_id == "all":
-        ctg_id_list = p_asm_G.ctg_data.keys()
-    else:
-        ctg_id_list = [ctg_id]
-
-    exe_list = []
-    for ctg_id in ctg_id_list:
-        if ctg_id[-1] != "F":
-            continue
-        if ctg_id not in all_rid_to_phase:
-            continue
-        exe_list.append( (ctg_id, os.path.join(".", ctg_id)) )
-
-    exec_pool = Pool(4)  #TODO, make this configurable
-    exec_pool.map( generate_haplotigs_for_ctg, exe_list)
-    #map( generate_haplotigs_for_ctg, exe_list)
diff --git a/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py b/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
deleted file mode 100644
index ab92956..0000000
--- a/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
+++ /dev/null
@@ -1,354 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from multiprocessing import Pool
-import subprocess as sp
-import shlex
-import argparse
-import re
-import sys
-
-arid2phase = {}
-
-def filter_stage1(input_):
-    db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len = input_
-    try:
-        ignore_rtn = []
-        current_q_id = None
-        ave_idt = 0.0
-        all_over_len = 0.0
-        overlap_data = {"5p":0, "3p":0}
-        overlap_phase = {"5p":set(), "3p":set()}
-        q_id = None
-
-        for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) ) ).splitlines():
-            l = l.strip().split()
-            q_id, t_id = l[:2]
-
-            if q_id not in arid2phase:
-                continue
-            if t_id not in arid2phase:
-                continue
-            if arid2phase[t_id][0] != arid2phase[q_id][0]:
-                continue
-
-            if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
-                continue
-
-            if q_id != None and q_id != current_q_id:
-
-                left_count = overlap_data["5p"]
-                right_count = overlap_data["3p"]
-
-                if abs(left_count - right_count) > max_diff:
-                    ignore_rtn.append( current_q_id )
-                elif left_count > max_ovlp or right_count > max_ovlp:
-                    ignore_rtn.append( current_q_id )
-                elif left_count < min_ovlp or right_count < min_ovlp:
-                    ignore_rtn.append( current_q_id )
-
-                #remove unphased reads if they are sandwiched by reads from the same phase
-                if current_q_id not in arid2phase:
-                    left_set = overlap_phase["5p"]
-                    right_set = overlap_phase["3p"]
-                    if len( left_set & right_set ) > 0:
-                        ignore_rtn.append( current_q_id )
-
-                overlap_data = {"5p":0, "3p":0}
-                overlap_phase = {"5p":set(), "3p":set()}
-                current_q_id = q_id
-                ave_idt = 0.0
-                all_over_len = 0.0
-
-            overlap_len = -int(l[2])
-            idt = float(l[3])
-            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
-            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
-
-            if idt < 90:
-                continue
-
-            if q_l < min_len or t_l < min_len:
-                continue
-
-            if l[-1] in ("contains", "overlap"):
-                ave_idt += idt * overlap_len
-                all_over_len += overlap_len
-            if q_s == 0:
-                overlap_data["5p"] += 1
-                if t_id in arid2phase:
-                    overlap_phase["5p"].add( arid2phase[t_id] )
-            if q_e == q_l:
-                overlap_data["3p"] += 1
-                if t_id in arid2phase:
-                    overlap_phase["3p"].add( arid2phase[t_id] )
-
-        if q_id != None:
-            left_count = overlap_data["5p"]
-            right_count = overlap_data["3p"]
-            if abs(left_count - right_count) > max_diff:
-                ignore_rtn.append( current_q_id )
-            elif left_count > max_ovlp or right_count > max_ovlp:
-                ignore_rtn.append( current_q_id )
-            elif left_count < min_ovlp or right_count < min_ovlp:
-                ignore_rtn.append( current_q_id )
-
-            #remove unphased reads if they are sandwiched by reads from the same phase
-            if current_q_id not in arid2phase:
-                left_set = overlap_phase["5p"]
-                right_set = overlap_phase["3p"]
-                if len( left_set & right_set ) > 0:
-                    ignore_rtn.append( current_q_id )
-
-
-        return fn, ignore_rtn
-
-    except (KeyboardInterrupt, SystemExit):
-        return
-
-def filter_stage2(input_):
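-    # filter_stage2: second pass; with the stage-1 ignore set applied, collect
-    # the ids of reads contained in (or containing) another read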
-    db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set = input_
-    try:
-        contained_id = set()
-        for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn))).splitlines():
-            l = l.strip().split()
-            q_id, t_id = l[:2]
-
-
-            if q_id not in arid2phase:
-                continue
-            if t_id not in arid2phase:
-                continue
-
-            if arid2phase[t_id][0] != arid2phase[q_id][0]:
-                continue
-            if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
-                continue
-
-
-            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
-            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
-
-            idt = float(l[3])
-            if idt < 90:
-                continue
-
-            if q_l < min_len or t_l < min_len:
-                continue
-
-            if q_id in ignore_set:
-                continue
-            if t_id in ignore_set:
-                continue
-            if l[-1] == "contained":
-                contained_id.add(q_id)
-            if l[-1] == "contains":
-                contained_id.add(t_id)
-        return fn, contained_id
-
-    except (KeyboardInterrupt, SystemExit):
-        return
-
-def filter_stage3(input_):
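-    # filter_stage3: re-stream the overlaps, drop ignored and contained reads,
-    # and emit at least the best-n overlaps per read end; the sort key
-    # (-inphase, -overlap_len, ...) lists in-phase overlaps first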
-
-    db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn = input_
-
-    overlap_data = {"5p":[], "3p":[]}
-    try:
-        ovlp_output = []
-        current_q_id = None
-        for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) )).splitlines():
-            l = l.strip().split()
-            q_id, t_id = l[:2]
-
-
-            if q_id not in arid2phase:
-                continue
-            if t_id not in arid2phase:
-                continue
-
-            if arid2phase[t_id][0] != arid2phase[q_id][0]:
-                continue
-            if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
-                continue
-
-            if current_q_id == None:
-                current_q_id = q_id
-                overlap_data = {"5p":[], "3p":[]}
-
-            elif q_id != current_q_id:
-                left = overlap_data["5p"]
-                right = overlap_data["3p"]
-                left.sort()
-                right.sort()
-
-                for i in xrange(len(left)):
-                    inphase, score, m_range, ovlp = left[i]
-                    ovlp_output.append(ovlp)
-                    #print " ".join(ovlp), read_end_data[current_q_id]
-                    if i >= bestn and m_range > 1000:
-                        break
-
-                for i in xrange(len(right)):
-                    inphase, score, m_range, ovlp = right[i]
-                    ovlp_output.append(ovlp)
-                    #print " ".join(ovlp), read_end_data[current_q_id]
-                    if i >= bestn and m_range > 1000:
-                        break
-
-                overlap_data = {"5p":[], "3p":[]}
-                current_q_id = q_id
-
-            if q_id in contained_set:
-                continue
-            if t_id in contained_set:
-                continue
-            if q_id in ignore_set:
-                continue
-            if t_id in ignore_set:
-                continue
-
-            overlap_len = -int(l[2])
-            idt = float(l[3])
-            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
-            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
-
-            if idt < 90:
-                continue
-            if q_l < min_len or t_l < min_len:
-                continue
-            if q_s == 0:
-                l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
-                inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
-                #nphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
-                overlap_data["5p"].append( (-inphase, -overlap_len,  t_l - (t_e - t_s),  l ) )
-            elif q_e == q_l:
-                l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
-                inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
-                #inphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
-                overlap_data["3p"].append( (-inphase, -overlap_len, t_l - (t_e - t_s), l ) )
-
-        left = overlap_data["5p"]
-        right = overlap_data["3p"]
-        left.sort()
-        right.sort()
-
-
-        for i in xrange(len(left)):
-            inphase, score, m_range, ovlp = left[i]
-            ovlp_output.append(ovlp)
-            #print " ".join(ovlp), read_end_data[current_q_id]
-            if i >= bestn and m_range > 1000:
-                break
-
-        for i in xrange(len(right)):
-            inphase, score, m_range, ovlp = right[i]
-            ovlp_output.append(ovlp)
-            #print " ".join(ovlp), read_end_data[current_q_id]
-            if i >= bestn and m_range > 1000:
-                break
-
-        return fn, ovlp_output
-    except (KeyboardInterrupt, SystemExit):
-        return
-
-
-def parse_args(argv):
-
-    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
-    parser.add_argument('--n_core', type=int, default=4,
-                        help='number of worker processes used for filtering')
-    parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
-    parser.add_argument('--db', type=str, help='read db file path')
-    parser.add_argument('--max_diff', type=int, help="max difference between 5' and 3' coverage")
-    parser.add_argument('--max_cov', type=int, help="max 5' or 3' coverage")
-    parser.add_argument('--min_cov', type=int, help="min 5' or 3' coverage")
-    parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads")
-    parser.add_argument('--bestn', type=int, default=10, help="output at least the best n overlaps on the 5' and 3' ends when possible")
-    parser.add_argument('--rid_phase_map', type=str, help="the file that encodes the mapping of read ids to phase blocks", required=True)
-    args = parser.parse_args(argv[1:])
-
-    return args
-
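-# example invocation (hypothetical file names and cutoffs; flags as defined in parse_args above):
-#   fc_ovlp_filter_with_phase.py --fofn las.fofn --db raw_reads.db \
-#       --max_diff 60 --max_cov 60 --min_cov 2 --rid_phase_map rid_to_phase > preads.p_ovl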
-def main(argv=sys.argv):
-    args = parse_args(argv)
-
-    max_diff = args.max_diff
-    max_cov = args.max_cov
-    min_cov = args.min_cov
-    min_len = args.min_len
-    bestn = args.bestn
-    db_fn = args.db
-
-    with open(args.rid_phase_map) as f:
-        for row in f:
-            row = row.strip().split()
-            arid2phase[row[0]] = (row[1], row[2], row[3]) #ctg_id, phase_blk_id, phase_id
-
-    exe_pool = Pool(args.n_core)
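-    # three passes over the same set of .las files: stage 1 builds the ignore
-    # set, stage 2 the contained-read set, and stage 3 prints the surviving
-    # overlap records (with phase annotations appended) to stdout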
-
-    file_list = open(args.fofn).read().split("\n")
-    inputs = []
-    for fn in file_list:
-        if len(fn) != 0:
-            inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len) )
-
-    ignore_all = []
-    for res in exe_pool.imap(filter_stage1, inputs):
-        ignore_all.extend( res[1] )
-
-    inputs = []
-    ignore_all = set(ignore_all)
-    for fn in file_list:
-        if len(fn) != 0:
-            inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all) )
-    contained = set()
-    for res in exe_pool.imap(filter_stage2, inputs):
-        contained.update(res[1])
-        #print res[0], len(res[1]), len(contained)
-
-    #print "all", len(contained)
-    inputs = []
-    ignore_all = set(ignore_all)
-    for fn in file_list:
-        if len(fn) != 0:
-            inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn) )
-    for res in exe_pool.imap(filter_stage3, inputs):
-        for l in res[1]:
-            print " ".join(l)
diff --git a/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py b/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
deleted file mode 100644
index 772f011..0000000
--- a/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
+++ /dev/null
@@ -1,1545 +0,0 @@
-import networkx as nx
-import os
-import shlex
-import sys
-import subprocess
-import argparse
-
-DEBUG_LOG_LEVEL = 0
-
-class SGNode(object):
-    """
-    class representing a node in the string graph
-    """
-    def __init__(self, node_name):
-        self.name = node_name
-        self.out_edges = []
-        self.in_edges = []
-
-    def add_out_edge(self, out_edge):
-        self.out_edges.append(out_edge)
-
-    def del_out_edge(self, out_edge):
-        self.out_edges.remove(out_edge)
-
-    def add_in_edge(self, in_edge):
-        self.in_edges.append(in_edge)
-
-    def del_in_edge(self, in_edge):
-        self.in_edges.remove(in_edge)
-
-class SGEdge(object):
-    """
-    class representing an edge in the string graph
-    """
-    def __init__(self, in_node, out_node):
-        self.in_node = in_node
-        self.out_node = out_node
-        self.attr = {}
-    def set_attribute(self, attr, value):
-        self.attr[attr] = value
-
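-# every read contributes two string-graph nodes, "<read_id>:B" and "<read_id>:E";
-# reverse_end maps a node to the corresponding node on the reverse-complement strand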
-def reverse_end( node_id ):
-    node_id, end = node_id.split(":")
-    new_end = "B" if end == "E" else "E"
-    return node_id + ":" + new_end
-
-class StringGraph(object):
-    """
-    class representing the string graph
-    """
-    def __init__(self):
-        self.nodes = {}
-        self.edges = {}
-        self.n_mark = {}
-        self.e_reduce = {}
-        self.repeat_overlap = {}
-
-    def add_node(self, node_name):
-        """
-        add a node to the graph given a node name
-        """
-        if node_name not in self.nodes:
-            self.nodes[node_name] = SGNode(node_name)
-
-    def add_edge(self, in_node_name, out_node_name, **attributes):
-        """
-        add an edge to the graph given a pair of node names
-        """
-        if (in_node_name, out_node_name) not in self.edges:
-
-            self.add_node(in_node_name)
-            self.add_node(out_node_name)
-            in_node = self.nodes[in_node_name]
-            out_node = self.nodes[out_node_name]
-
-            edge = SGEdge(in_node, out_node)
-            self.edges[ (in_node_name, out_node_name) ] = edge
-            in_node.add_out_edge(edge)
-            out_node.add_in_edge(edge)
-        edge =  self.edges[ (in_node_name, out_node_name) ]
-        for k, v in attributes.items():
-            edge.attr[k] = v
-
-    def remove_cross_phase_edges(self):
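-        # delete every edge whose endpoint reads carry different phases;
-        # the removed edges are returned so they can still be reported ("CP")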
-
-        to_remove = {}
-
-        for k,e in self.edges.items():
-            if e.attr["phase"][0] != e.attr["phase"][1]:
-                to_remove[k] = e
-
-        for k, e in to_remove.items():
-            in_node_name, out_node_name = k
-            in_node = self.nodes[in_node_name]
-            out_node = self.nodes[out_node_name]
-            in_node.del_out_edge(e)
-            out_node.del_in_edge(e)
-            del self.edges[k]
-
-        return to_remove
-
-    def init_reduce_dict(self):
-        for e in self.edges:
-            self.e_reduce[e] = False
-
-    def bfs_nodes(self, n, exclude = None, depth=5):
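-        # bounded expansion from n: pops up to depth-1 candidate nodes and
-        # collects their out-neighbors, skipping `exclude`; used by
-        # mark_chimer_edges to test whether two neighborhoods reconnect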
-        all_nodes = set()
-        all_nodes.add(n)
-        candidate_nodes = set()
-        candidate_nodes.add(n)
-        dp = 1
-        while  dp < depth and len(candidate_nodes) > 0:
-            v = candidate_nodes.pop()
-            for e in v.out_edges :
-                w = e.out_node
-                if w == exclude:
-                    continue
-                if w not in all_nodes:
-                    all_nodes.add(w)
-                    if len(w.out_edges) > 0:
-                        candidate_nodes.add(w)
-            dp += 1
-
-        return all_nodes
-
-
-    def mark_chimer_edges(self):
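-        # heuristic chimera detection: a candidate node is both an out-neighbor
-        # of a multi-out-edge node and an in-neighbor of a multi-in-edge node;
-        # if its out-neighbors and its in-neighbors' other out-neighbors never
-        # reconnect (checked with bfs_nodes), all of its edges and their
-        # reverse-complement duals are reduced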
-
-        multi_in_nodes = {}
-        multi_out_nodes = {}
-        for n_name in self.nodes:
-            n = self.nodes[n_name]
-            out_nodes = [ e.out_node for e in n.out_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False ]
-            in_nodes = [e.in_node for e in n.in_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False]
-
-            if len(out_nodes) >= 2:
-                multi_out_nodes[n_name] = out_nodes
-            if len(in_nodes) >= 2:
-                multi_in_nodes[n_name] = in_nodes
-
-        chimer_candidates = set()
-        out_set = set()
-        in_set = set()
-        for n_name in multi_out_nodes:
-            out_nodes = set(multi_out_nodes[n_name])
-            out_set |= out_nodes
-
-        for n_name in multi_in_nodes:
-            in_nodes = set(multi_in_nodes[n_name])
-            in_set |= in_nodes
-
-        chimer_candidates = out_set & in_set
-
-        chimer_nodes = []
-        chimer_edges = set()
-        for n in chimer_candidates:
-            out_nodes = set( [ e.out_node for e in n.out_edges ] )
-            test_set = set()
-            for in_node in [e.in_node for e in n.in_edges ]:
-                test_set = test_set | set( [ e.out_node for e in in_node.out_edges ] )
-            test_set -= set([n])
-            if len( out_nodes & test_set ) == 0:
-                flow_node1 = set()
-                flow_node2 = set()
-                for v in list(out_nodes):
-                    flow_node1 |= self.bfs_nodes(v, exclude=n)
-                for v in list(test_set):
-                    flow_node2 |= self.bfs_nodes(v, exclude=n)
-                if len( flow_node1 & flow_node2 ) == 0:
-                    for e in n.out_edges:
-                        v, w =  e.in_node.name, e.out_node.name
-                        if self.e_reduce[ (v, w) ] != True:
-                            self.e_reduce[ (v, w) ] = True
-                            chimer_edges.add( (v, w) )
-                            rv = reverse_end(w)
-                            rw = reverse_end(v)
-                            self.e_reduce[ (rv, rw) ] = True
-                            chimer_edges.add( (rv, rw) )
-
-                    for e in n.in_edges:
-                        v, w =  e.in_node.name, e.out_node.name
-                        if self.e_reduce[ (v, w) ] != True:
-                            self.e_reduce[ (v, w) ] = True
-                            chimer_edges.add( (v, w) )
-                            rv = reverse_end(w)
-                            rw = reverse_end(v)
-                            self.e_reduce[ (rv, rw) ] = True
-                            chimer_edges.add( (rv, rw) )
-                    chimer_nodes.append( n.name )
-                    chimer_nodes.append( reverse_end(n.name) )
-
-        return chimer_nodes, chimer_edges
-
-    def mark_spur_edge(self):
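-        # reduce spur edges: at a branching node, an edge leading to (or coming
-        # from) a dead-end node is marked reduced along with its
-        # reverse-complement dual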
-
-        removed_edges = set()
-        for  v in self.nodes:
-            if len(self.nodes[v].out_edges) > 1:
-                for out_edge in self.nodes[v].out_edges:
-                    w = out_edge.out_node.name
-
-                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
-                        self.e_reduce[(v, w)] = True
-                        removed_edges.add( (v, w) )
-                        v2, w2 = reverse_end(w), reverse_end(v)
-                        self.e_reduce[(v2, w2)] = True
-                        removed_edges.add( (v2, w2) )
-
-            if len(self.nodes[v].in_edges) > 1:
-                for in_edge in self.nodes[v].in_edges:
-                    w = in_edge.in_node.name
-                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
-                        self.e_reduce[(w, v)] = True
-                        removed_edges.add( (w, v) )
-                        v2, w2 = reverse_end(w), reverse_end(v)
-                        self.e_reduce[(w2, v2)] = True
-                        removed_edges.add( (w2, v2) )
-        return removed_edges
-
-    def mark_tr_edges(self):
-        """
-        transitive reduction: mark transitively redundant edges, allowing a
-        FUZZ (500 bp) tolerance on the length comparison
-        """
-        n_mark = self.n_mark
-        e_reduce = self.e_reduce
-        FUZZ = 500
-        for n in self.nodes:
-            n_mark[n] = "vacant"
-
-        for n_name, node in self.nodes.items():
-
-            out_edges = node.out_edges
-            if len(out_edges) == 0:
-                continue
-
-            out_edges.sort(key=lambda x: x.attr["length"])
-
-            for e in out_edges:
-                w = e.out_node
-                n_mark[ w.name ] = "inplay"
-
-            max_len = out_edges[-1].attr["length"]
-
-            max_len += FUZZ
-
-            for e in out_edges:
-                e_len = e.attr["length"]
-                w = e.out_node
-                if n_mark[w.name] == "inplay":
-                    w.out_edges.sort( key=lambda x: x.attr["length"] )
-                    for e2 in w.out_edges:
-                        if e2.attr["length"] + e_len < max_len:
-                            x = e2.out_node
-                            if n_mark[x.name] == "inplay":
-                                n_mark[x.name] = "eliminated"
-
-            for e in out_edges:
-                e_len = e.attr["length"]
-                w = e.out_node
-                w.out_edges.sort( key=lambda x: x.attr["length"] )
-                if len(w.out_edges) > 0:
-                    x = w.out_edges[0].out_node
-                    if n_mark[x.name] == "inplay":
-                        n_mark[x.name] = "eliminated"
-                for e2 in w.out_edges:
-                    if e2.attr["length"] < FUZZ:
-                        x = e2.out_node
-                        if n_mark[x.name] == "inplay":
-                            n_mark[x.name] = "eliminated"
-
-            for out_edge in out_edges:
-                v = out_edge.in_node
-                w = out_edge.out_node
-                if n_mark[w.name] == "eliminated":
-                    e_reduce[ (v.name, w.name) ] = True
-                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
-                    e_reduce[(v_name, w_name)] = True
-                n_mark[w.name] = "vacant"
-
-
-    def mark_best_overlap(self):
-        """
-        for each node, keep only the best-scoring in-edge and out-edge;
-        mark every other live edge as reduced
-        """
-
-        best_edges = set()
-        removed_edges = set()
-
-        for v in self.nodes:
-
-            out_edges = self.nodes[v].out_edges
-            if len(out_edges) > 0:
-                out_edges.sort(key=lambda e: -e.attr["score"])
-                for e in out_edges:
-                    if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
-                        best_edges.add( (e.in_node.name, e.out_node.name) )
-                        break
-
-            in_edges = self.nodes[v].in_edges
-            if len(in_edges) > 0:
-                in_edges.sort(key=lambda e: -e.attr["score"])
-                for e in in_edges:
-                    if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
-                        best_edges.add( (e.in_node.name, e.out_node.name) )
-                        break
-
-        if DEBUG_LOG_LEVEL > 1:
-            print "X", len(best_edges)
-
-        for e_n, e in self.edges.items():
-            v = e_n[0]
-            w = e_n[1]
-            if self.e_reduce[ (v, w) ] != True:
-                if (v, w) not in best_edges:
-                    self.e_reduce[(v, w)] = True
-                    removed_edges.add( (v, w) )
-                    v2, w2 = reverse_end(w), reverse_end(v)
-                    self.e_reduce[(v2, w2)] = True
-                    removed_edges.add( (v2, w2) )
-
-        return removed_edges
-
-    def resolve_repeat_edges(self):
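-        # for every node with exactly one live in-edge and one live out-edge,
-        # reduce edges that branch from its in-node (or into its out-node)
-        # toward well-connected nodes sharing no neighbors with it; such
-        # edges are treated as putative repeat-induced connections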
-
-
-        edges_to_reduce = []
-        nodes_to_test = set()
-        for v_n, v in self.nodes.items():
-
-            out_nodes = []
-            for e in v.out_edges:
-                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
-                    out_nodes.append( e.out_node.name )
-
-            in_nodes = []
-            for e in v.in_edges:
-                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
-                    in_nodes.append( e.in_node.name )
-
-            if len(out_nodes) == 1 and len(in_nodes)  == 1:
-                nodes_to_test.add(v_n)
-
-        for v_n in list( nodes_to_test ):
-
-            v = self.nodes[v_n]
-
-            out_nodes = []
-            for e in v.out_edges:
-                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
-                    out_nodes.append( e.out_node.name )
-
-            in_nodes = []
-            for e in v.in_edges:
-                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
-                    in_nodes.append( e.in_node.name )
-
-            in_node_name = in_nodes[0]
-
-            for out_edge in self.nodes[in_node_name].out_edges:
-                vv = out_edge.in_node.name
-                ww = out_edge.out_node.name
-
-                ww_out = self.nodes[ww].out_edges
-                v_out = self.nodes[v_n].out_edges
-                ww_out_nodes = set( [ n.out_node.name for n in ww_out] )
-                v_out_nodes = set(  [ n.out_node.name for n in v_out] )
-                o_overlap = len( ww_out_nodes & v_out_nodes )
-
-                ww_in_count = 0
-                for e in self.nodes[ww].in_edges:
-                    if self.e_reduce[ ( e.in_node.name, e.out_node.name ) ] == False:
-                        ww_in_count += 1
-
-                if ww != v_n and\
-                   self.e_reduce[ (vv, ww) ] == False and\
-                   ww_in_count > 1 and\
-                   ww not in nodes_to_test and\
-                   o_overlap == 0:
-                    edges_to_reduce.append( (vv, ww) )
-
-            out_node_name = out_nodes[0]
-
-            for in_edge in self.nodes[out_node_name].in_edges:
-                vv = in_edge.in_node.name
-                ww = in_edge.out_node.name
-
-                vv_in = self.nodes[vv].in_edges
-                v_in = self.nodes[v_n].in_edges
-                vv_in_nodes = set( [ n.in_node.name for n in vv_in] )
-                v_in_nodes = set(  [ n.in_node.name for n in v_in] )
-                i_overlap = len( vv_in_nodes & v_in_nodes )
-
-                vv_out_count = 0
-                for e in self.nodes[vv].out_edges:
-                    if self.e_reduce[ ( e.in_node.name, e.out_node.name )] == False:
-                        vv_out_count += 1
-
-                if vv != v_n and\
-                   self.e_reduce[ (vv, ww) ] == False and\
-                   vv_out_count > 1 and\
-                   vv not in nodes_to_test and\
-                   i_overlap == 0:
-                    edges_to_reduce.append( (vv, ww) )
-
-        removed_edges = set()
-        for e in edges_to_reduce:
-            self.e_reduce[e] = True
-            removed_edges.add(e)
-
-        return removed_edges
-
-    def get_out_edges_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].out_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        return rtn
-
-
-    def get_in_edges_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].in_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        return rtn
-
-    def get_best_out_edge_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].out_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        rtn.sort(key=lambda e: e.attr["score"])
-
-        return rtn[-1]
-
-    def get_best_in_edge_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].in_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        rtn.sort(key=lambda e: e.attr["score"])
-        return rtn[-1]
-
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-def generate_seq_from_path(sg, seqs, path):
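-    # splice a sequence along a node path: each edge label selects a slice
-    # seqs[read_id][b:e] of the underlying read, reverse-complemented via
-    # RCMAP when the coordinates run backwards (b > e)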
-    subseqs = []
-    r_id, end = path[0].split(":")
-
-    count = 0
-    for i in range( len( path ) -1 ):
-        w_n, v_n = path[i:i+2]
-        edge = sg.edges[ (w_n, v_n ) ]
-        read_id, coor = edge.attr["label"].split(":")
-        b,e = coor.split("-")
-        b = int(b)
-        e = int(e)
-        if b < e:
-            subseqs.append( seqs[read_id][b:e] )
-        else:
-            subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
-
-    return "".join(subseqs)
-
-
-def reverse_edge( e ):
-    e1, e2 = e
-    return reverse_end(e2), reverse_end(e1)
-
-def reverse_path( p ):
-    p = p[::-1]
-    return [reverse_end(n) for n in p]
-
-
-def find_bundle(ug, u_edge_data, start_node, depth_cutoff, width_cutoff, length_cutoff):
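-    # grow a "bundle" from start_node inside a depth-limited ego graph,
-    # tracking the frontier tips; per-node length/score follow the
-    # best-scoring incoming edge; the search converges when a single tip
-    # remains and aborts on loops, too many tips, or when the width, depth,
-    # or length cutoffs are exceeded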
-
-    tips = set()
-    bundle_edges = set()
-    bundle_nodes = set()
-
-    local_graph = nx.ego_graph(ug, start_node, depth_cutoff, undirected=False)
-    length_to_node = {start_node:0}
-    score_to_node = {start_node:0}
-
-    v = start_node
-    end_node = start_node
-
-    if DEBUG_LOG_LEVEL > 1:
-        print
-        print
-        print "start", start_node
-
-    bundle_nodes.add(v)
-    for vv, ww, kk in local_graph.out_edges(v, keys = True):
-        max_score = 0
-        max_length = 0
-
-        if (vv, ww, kk) not in bundle_edges and\
-                reverse_end(ww) not in bundle_nodes:
-
-            bundle_edges.add( (vv, ww, kk) )
-            tips.add(ww)
-
-    for v in list(tips):
-        bundle_nodes.add(v)
-
-    depth = 1
-    width = 1.0
-    coverage = False
-
-
-    while 1:
-        if DEBUG_LOG_LEVEL > 1:
-            print "# of tips", len(tips)
-
-        if len(tips) > 4:
-            coverage = False
-            break
-
-        if len(tips) == 1:
-            end_node = tips.pop()
-
-            if DEBUG_LOG_LEVEL > 1:
-                print "end", end_node
-
-            if end_node not in length_to_node:
-                v = end_node
-                max_score_edge = None
-                max_score = 0
-                for uu, vv, kk in local_graph.in_edges(v, keys=True):
-                    if uu not in length_to_node:
-                        continue
-
-                    score = u_edge_data[ (uu, vv, kk) ][1]
-
-                    if score > max_score:
-
-                        max_score = score
-                        max_score_edge = (uu, vv, kk)
-
-                length_to_node[v] = length_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][0]
-                score_to_node[v] = score_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][1]
-
-
-            coverage = True
-            break
-
-
-        depth += 1
-        width = 1.0 * len(bundle_edges) / depth
-
-        if depth > 10 and width > width_cutoff:
-            coverage = False
-            break
-
-        if depth > depth_cutoff:
-            coverage = False
-            break
-
-        tips_list = list(tips)
-
-        tip_updated = False
-        loop_detect = False
-        length_limit_reached = False
-
-        for v in tips_list:
-            if DEBUG_LOG_LEVEL > 1:
-                print "process", v
-
-            if len(local_graph.out_edges(v, keys=True)) == 0: # dead end route
-                print "no out edge", v
-                continue
-
-            max_score_edge = None
-            max_score = 0
-
-            extend_tip = True
-
-            for uu, vv, kk in local_graph.in_edges(v, keys=True):
-                if DEBUG_LOG_LEVEL > 1:
-                    print "in_edges", uu, vv, kk
-                    print uu, "in length_to_node",  uu in length_to_node
-
-                if uu not in length_to_node:
-                    extend_tip = False
-                    break
-
-                score = u_edge_data[ (uu, vv, kk) ][1]
-
-                if score > max_score:
-
-                    max_score = score
-                    max_score_edge = (uu, vv, kk)
-
-            if extend_tip:
-
-                length_to_node[v] = length_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][0]
-                score_to_node[v] = score_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][1]
-
-                if length_to_node[v] > length_cutoff:
-                    length_limit_reached = True
-                    coverage = False
-                    break
-
-                v_updated = False
-                for vv, ww, kk in local_graph.out_edges(v, keys=True):
-
-                    if DEBUG_LOG_LEVEL > 1:
-                        print "test", vv, ww, kk
-
-                    if ww in length_to_node:
-                        loop_detect = True
-                        if DEBUG_LOG_LEVEL > 1:
-                            print "loop_detect", ww
-                        break
-
-                    if (vv, ww, kk) not in bundle_edges and\
-                            reverse_end(ww) not in bundle_nodes:
-
-                        if DEBUG_LOG_LEVEL > 1:
-                            print "add", ww
-
-                        tips.add(ww)
-                        bundle_edges.add( (vv, ww, kk) )
-                        tip_updated = True
-                        v_updated = True
-
-                if v_updated:
-
-                    if DEBUG_LOG_LEVEL > 1:
-                        print "remove", v
-
-                    tips.remove(v)
-
-                    if len(tips) == 1:
-                        break
-
-            if loop_detect:
-                coverage = False
-                break
-
-        if length_limit_reached:
-            coverage = False
-            break
-
-        if loop_detect:
-            coverage = False
-            break
-
-        if not tip_updated:
-            coverage = False
-            break
-
-        for v in list(tips):
-            bundle_nodes.add(v)
-
-
-
-    data = start_node, end_node, bundle_edges, length_to_node[end_node], score_to_node[end_node], depth
-
-    data_r = None
-
-    if DEBUG_LOG_LEVEL > 1:
-        print coverage, data, data_r
-    return coverage, data, data_r
-
-def generate_string_graph(args):
-
-    overlap_file = args.overlap_file
-
-    contained_reads = set()
-    chimer_ids = set()
-
-    filter_reads = False
-
-    seqs = set()
-
-    G=nx.Graph()
-    edges =set()
-    overlap_data = []
-    contained_reads = set()
-    overlap_count = {}
-
-
-    # loop through the overlap data and load it into a Python list;
-    # contained reads are identified along the way
-
-    with open(overlap_file) as f:
-        for l in f:
-            l = l.strip().split()
-
-
-            # work around some ill-formed data records
-            #if len(l) != 13:
-            #    continue
-
-            f_id, g_id, score, identity = l[:4]
-
-            if f_id == g_id:  # don't need self-self overlaps
-                continue
-
-            if filter_reads:
-
-                if g_id not in seqs:
-                    continue
-
-                if f_id not in seqs:
-                    continue
-
-            phase1 = l[13]
-            phase2 = l[14]
-
-            score = int(score)
-            identity = float(identity)
-            contained = l[12]
-            if contained == "contained" and phase1 == phase2:
-                contained_reads.add(f_id)
-                continue
-            if contained == "contains" and phase1 == phase2:
-                contained_reads.add(g_id)
-                continue
-            if contained == "none":
-                continue
-
-            if identity < args.min_idt: # only keep records at or above the identity threshold (default 96%)
-                continue
-            f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
-            g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
-
-            # only use reads longer than min_len (default 4 kb) for assembly
-            if f_len < args.min_len: continue
-            if g_len < args.min_len: continue
-
-            """
-            # double-check for a proper overlap
-            # this is not necessary when using DALIGNER as the overlapper
-            # it may be useful if other overlappers give fuzzier alignment boundaries
-            if f_start > 24 and f_len - f_end > 24:  # allow 24 base tolerance on both sides of the overlapping
-                continue
-
-            if g_start > 24 and g_len - g_end > 24:
-                continue
-
-            if g_strain == 0:
-                if f_start < 24 and g_len - g_end > 24:
-                    continue
-                if g_start < 24 and f_len - f_end > 24:
-                    continue
-            else:
-                if f_start < 24 and g_start > 24:
-                    continue
-                if g_start < 24 and f_start > 24:
-                    continue
-            """
-
-            overlap_data.append( (f_id, g_id, score, identity,
-                                  f_strain, f_start, f_end, f_len,
-                                  g_strain, g_start, g_end, g_len,
-                                  phase1, phase2) )
-
-            overlap_count[f_id] = overlap_count.get(f_id,0)+1
-            overlap_count[g_id] = overlap_count.get(g_id,0)+1
-
-    overlap_set = set()
-    sg = StringGraph()
-    for od in overlap_data:
-        f_id, g_id, score, identity = od[:4]
-        if f_id in contained_reads:
-            continue
-        if g_id in contained_reads:
-            continue
-        f_s, f_b, f_e, f_l = od[4:8]
-        g_s, g_b, g_e, g_l = od[8:12]
-
-        phase1 = od[12]
-        phase2 = od[13]
-
-        overlap_pair = [f_id, g_id]
-        overlap_pair.sort()
-        overlap_pair = tuple( overlap_pair )
-        if overlap_pair in overlap_set:  # don't allow duplicated records
-            continue
-        else:
-            overlap_set.add(overlap_pair)
-
-
-        if g_s == 1: # reversed alignment; swap the begin and end coordinates
-            g_b, g_e = g_e, g_b
-
-        # build the string graph edges for each overlap
-        if f_b > 1:
-            if g_b < g_e:
-                """
-                     f.B         f.E
-                  f  ----------->
-                  g         ------------->
-                            g.B           g.E
-                """
-                if f_b == 0 or g_e - g_l == 0:
-                    continue
-                sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
-                                                           length = abs(f_b-0),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase = (phase2, phase1))
-                sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = (g_id, g_e, g_l),
-                                                           length = abs(g_e-g_l),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase = (phase1, phase2))
-            else:
-                """
-                     f.B         f.E
-                  f  ----------->
-                  g         <-------------
-                            g.E           g.B
-                """
-                if f_b == 0 or g_e == 0:
-                    continue
-                sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
-                                                           length = abs(f_b -0),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase = (phase2, phase1))
-                sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = (g_id, g_e, 0),
-                                                           length = abs(g_e- 0),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase = (phase1, phase2))
-        else:
-            if g_b < g_e:
-                """
-                                    f.B         f.E
-                  f                 ----------->
-                  g         ------------->
-                            g.B           g.E
-                """
-                if g_b == 0 or f_e - f_l == 0:
-                    continue
-                sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = (g_id, g_b, 0),
-                                                           length = abs(g_b - 0),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase = (phase1, phase2))
-                sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
-                                                           length = abs(f_e-f_l),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase = (phase2, phase1))
-            else:
-                """
-                                    f.B         f.E
-                  f                 ----------->
-                  g         <-------------
-                            g.E           g.B
-                """
-                if g_b - g_l == 0 or f_e - f_l ==0:
-                    continue
-                sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = (g_id, g_b, g_l),
-                                                           length = abs(g_b - g_l),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase = (phase1, phase2))
-                sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
-                                                           length = abs(f_e - f_l),
-                                                           score = -score,
-                                                           identity = identity,
-                                                           phase=(phase2, phase1))
-
-
-    cp_edges = sg.remove_cross_phase_edges()
-    sg.init_reduce_dict()
-
-    #if not args.disable_chimer_prediction:
-    #    sg.mark_chimer_edges()
-    #sg.mark_spur_edge()
-
-
-    sg.mark_tr_edges() # mark edges that are transitively redundant
-
-    if DEBUG_LOG_LEVEL > 1:
-        print sum( [1 for c in sg.e_reduce.values() if c == True] )
-        print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-    chimer_nodes, chimer_edges = sg.mark_chimer_edges()
-
-    removed_edges = set()
-    if args.lfc == True:
-        removed_edges = sg.resolve_repeat_edges()
-    else:
-        removed_edges = sg.mark_best_overlap() # reduce all edges that are not best-overlap edges
-
-    spur_edges = sg.mark_spur_edge()
-
-    if DEBUG_LOG_LEVEL > 1:
-        print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-    #max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
-    out_f = open("sg_edges_list", "w")
-    nxsg = nx.DiGraph()
-    edge_data = {}
-    for v, w in sg.edges:
-        e = sg.edges[ (v, w) ]
-        rid, sp, tp = e.attr["label"]
-        score = e.attr["score"]
-        identity = e.attr["identity"]
-        length = abs(sp-tp)
-
-
-        if  sg.e_reduce[(v, w)] != True:
-            type_ = "G"
-            label = "%s:%d-%d" % (rid, sp, tp)
-            nxsg.add_edge(v, w, label = label, length = length, score = score)
-            edge_data[ (v, w) ] = (rid, sp, tp, length, score, identity, type_)
-        elif (v, w) in chimer_edges:
-            type_ = "C"
-        elif (v, w) in removed_edges:
-            type_ = "R"
-        elif (v, w) in spur_edges:
-            type_ = "S"
-        elif sg.e_reduce[(v, w)] == True:
-            type_ = "TR"
-
-        print >>out_f, v, w, rid, sp, tp, score, identity, type_
-
-    for k,e in cp_edges.items():
-        v = k[0]
-        w = k[1]
-        rid, sp, tp = e.attr["label"]
-        score = e.attr["score"]
-        identity = e.attr["identity"]
-        length = abs(sp-tp)
-        print >>out_f, v, w, rid, sp, tp, score, identity, "CP"
-
-
-    out_f.close()
-    nxsg_r = nxsg.reverse()
-
-    return nxsg, nxsg_r, edge_data
-
-
-
-def construct_compound_paths(ug, u_edge_data):
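-    # collapse local bundles of unitig edges into single "compound" paths:
-    # run find_bundle from every multi-out branch node, then drop bundles
-    # whose edges overlap an already accepted bundle, whose reverse-complement
-    # dual is missing, or whose leading edges are shared by multiple bundles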
-
-    source_nodes = set()
-    sink_nodes = set()
-    simple_nodes = set()
-    branch_nodes = set()
-
-    all_nodes = ug.nodes()
-    for n in all_nodes:
-        in_degree = len( ug.in_edges(n) )
-        out_degree = len( ug.out_edges(n) )
-        if in_degree == 0:
-            source_nodes.add(n)
-        if out_degree == 0:
-            sink_nodes.add(n)
-        if in_degree == 1 and out_degree == 1:
-            simple_nodes.add(n)
-        if in_degree > 1 or out_degree > 1:
-            branch_nodes.add(n)
-
-    #print "#", len(all_nodes),len(source_nodes), len(sink_nodes), len(simple_nodes), len(branch_nodes)
-    compound_paths_0 = []
-    for p in list(branch_nodes):
-        if ug.out_degree(p) > 1:
-            coverage, data, data_r =  find_bundle(ug, u_edge_data, p, 48, 16, 500000)
-            if coverage == True:
-                start_node, end_node, bundle_edges, length, score, depth = data
-                compound_paths_0.append(  (start_node, "NA", end_node, 1.0*len(bundle_edges)/depth, length, score, bundle_edges ) )
-
-    compound_paths_0.sort( key=lambda x: -len(x[6]) )
-
-
-    edge_to_cpath = {}
-    compound_paths_1 = {}
-    for s, v, t, width, length, score, bundle_edges in compound_paths_0:
-        if DEBUG_LOG_LEVEL > 1:
-            print "constructing utg, test ", s,v, t
-
-        overlapped = False
-        for vv, ww, kk in list(bundle_edges):
-            if (vv, ww, kk) in edge_to_cpath:
-                if DEBUG_LOG_LEVEL > 1:
-                    print "remove overlapped utg", (s, v, t), (vv, ww, kk)
-                overlapped = True
-                break
-            rvv = reverse_end(vv)
-            rww = reverse_end(ww)
-            rkk = reverse_end(kk)
-            if (rww, rvv, rkk) in edge_to_cpath:
-                if DEBUG_LOG_LEVEL > 1:
-                    print "remove overlapped r utg", (s, v, t),  (rww, rvv, rkk)
-                overlapped = True
-                break
-
-
-        if not overlapped:
-            if DEBUG_LOG_LEVEL > 1:
-                print "constructing", s,v, t
-
-            bundle_edges_r = []
-            rs = reverse_end(t)
-            rt = reverse_end(s)
-
-            for vv, ww, kk in list(bundle_edges):
-                edge_to_cpath.setdefault( (vv, ww, kk), set() )
-                edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
-                rvv = reverse_end(ww)
-                rww = reverse_end(vv)
-                rkk = reverse_end(kk)
-                edge_to_cpath.setdefault( (rvv, rww, rkk), set() )
-                edge_to_cpath[ (rvv, rww, rkk) ].add( (rs, rt, v) ) #assert v == "NA"
-                bundle_edges_r.append(  (rvv, rww, rkk) )
-
-            compound_paths_1[ ( s, v, t) ] = width, length, score, bundle_edges
-            compound_paths_1[ ( rs, v, rt) ] = width, length, score, bundle_edges_r
-
-
-    compound_paths_2 = {}
-    edge_to_cpath = {}
-    for s, v, t in compound_paths_1:
-        rs = reverse_end(t)
-        rt = reverse_end(s)
-        if (rs, "NA", rt) not in compound_paths_1:
-            if DEBUG_LOG_LEVEL > 1:
-                print "non_compliment bundle", s, v, t, len(compound_paths_1[( s, v, t)][-1])
-            continue
-        width, length, score, bundle_edges = compound_paths_1[ (s, v, t) ]
-        compound_paths_2[ (s, v, t) ] = width, length, score, bundle_edges
-        for vv, ww, kk in list(bundle_edges):
-            edge_to_cpath.setdefault( (vv, ww, kk), set() )
-            edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
-
-
-    compound_paths_3 = {}
-    for k, val in compound_paths_2.items():
-
-        start_node, NA, end_node = k
-        rs = reverse_end(end_node)
-        rt = reverse_end(start_node)
-        assert (rs, "NA", rt) in compound_paths_2
-
-        contained = False
-        for vv, ww, kk in ug.out_edges(start_node, keys=True):
-            if len(edge_to_cpath.get( (vv, ww, kk), [] )) > 1:
-                contained = True
-
-        if not contained:
-            compound_paths_3[k] = val
-            if DEBUG_LOG_LEVEL > 1:
-                print "compound", k
-
-    compound_paths = {}
-    for s, v, t in compound_paths_3:
-        rs = reverse_end(t)
-        rt = reverse_end(s)
-        if (rs, "NA", rt) not in compound_paths_3:
-            continue
-        compound_paths[ (s, v, t) ] = compound_paths_3[ (s, v, t) ]
-
-    return compound_paths
-
-def parse_args(argv):
-
-    parser = argparse.ArgumentParser(description='a string graph assembler that is designed for handling diploid genomes')
-    parser.add_argument('overlap_file', help='a file that contains the overlap information.')
-    parser.add_argument('--min_len', type=int, default=4000,
-                        help='minimum length of the reads to be considered for assembling')
-    parser.add_argument('--min_idt', type=float, default=96,
-                        help='minimum alignment identity of the reads to be considered for assembling')
-    parser.add_argument('--lfc', action="store_true", default=False,
-                        help='use the local flow constraint method rather than the best overlap method to resolve knots in the string graph')
-    args = parser.parse_args(argv[1:])
-    return args
-
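-# example invocation (hypothetical file name; options as defined in parse_args above):
-#   fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 4000 --min_idt 96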
-def main(argv=sys.argv):
-    args = parse_args(argv)
-
-    # transitive reduction; remove spurs and putative repeat-induced edges
-    sg, sg_r, edge_data = generate_string_graph(args)
-
-
-    simple_paths = {}
-    dual_path = {}
-
-
-    sg2 = nx.DiGraph()
-
-    for v, w in edge_data:
-
-        assert (reverse_end(w), reverse_end(v)) in edge_data
-
-        #if (v, w) in masked_edges:
-        #    continue
-
-        rid, sp, tp, length, score, identity, type_ = edge_data[ (v, w) ]
-        if type_ != "G":
-            continue
-
-        label = "%s:%d-%d" % (rid, sp, tp)
-        sg2.add_edge( v, w, label = label, length = length, score = score)
-
-
-    # utg construction phase 1: identify all simple paths
-    s_nodes = set()
-    t_nodes = set()
-    simple_nodes = set()
-
-    all_nodes = sg2.nodes()
-    for n in all_nodes:
-        in_degree = len( sg2.in_edges(n) )
-        out_degree = len( sg2.out_edges(n) )
-        if in_degree == 1 and out_degree == 1:
-            simple_nodes.add(n)
-        else:
-            if out_degree != 0:
-                s_nodes.add(n)
-            if in_degree != 0:
-                t_nodes.add(n)
-
-
-    free_edges = set(sg2.edges())
-
-    if DEBUG_LOG_LEVEL > 1:
-        for s in list(simple_nodes):
-            print "simple_node", s
-        for s in list(s_nodes):
-            print "s_node", s
-        for s in list(t_nodes):
-            print "t_node", s
-
-        for v,w in free_edges:
-            if (reverse_end(w), reverse_end(v) ) not in free_edges:
-                print "bug", v,w
-                print reverse_end(w), reverse_end(v)
-
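-    # trace every simple path and its reverse-complement twin together,
-    # consuming both orientations from free_edges so each unitig is built
-    # exactly once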
-    while len(free_edges) != 0:
-        if len(s_nodes) != 0:
-            n = s_nodes.pop()
-            if DEBUG_LOG_LEVEL > 1:
-                print "initial utg 1", n
-        else:
-            e = free_edges.pop()
-            free_edges.add(e)
-            n = e[0]
-            if DEBUG_LOG_LEVEL > 1:
-                print "initial utg 2", n
-
-        path = []
-        path_length =0
-        path_score = 0
-        for v, w in sg2.out_edges(n):
-            if (v, w) not in free_edges:
-                continue
-            rv = reverse_end(v)
-            rw = reverse_end(w)
-
-            path_length = 0
-            path_score = 0
-            v0 = v
-            w0 = w
-            path = [v, w]
-            path_edges = set()
-            path_edges.add( (v, w) )
-            path_length += edge_data[ (v, w) ][3]
-            path_score += edge_data[ (v, w) ][4]
-            free_edges.remove( (v, w) )
-
-            r_path_length = 0
-            r_path_score = 0
-            rv0 = rv
-            rw0 = rw
-            r_path = [rv, rw] # need to reverse again
-            r_path_edges = set()
-            r_path_edges.add( (rw, rv) )
-            r_path_length += edge_data[ (rw, rv) ][3]
-            r_path_score += edge_data[ (rw, rv) ][4]
-            free_edges.remove( (rw, rv) )
-
-            while w in simple_nodes:
-                w, w_ = sg2.out_edges(w)[0]
-                if (w, w_) not in free_edges:
-                    break
-                rw_, rw = reverse_end(w_), reverse_end(w)
-
-                if ( rw_, rw ) in path_edges:
-                    break
-
-                path.append(w_)
-                path_edges.add( (w, w_) )
-                path_length += edge_data[ (w, w_) ][3]
-                path_score += edge_data[ (w, w_) ][4]
-                free_edges.remove( (w, w_) )
-
-                r_path.append(rw_)
-                r_path_edges.add( (rw_, rw) )
-                r_path_length += edge_data[ (rw_, rw) ][3]
-                r_path_score += edge_data[ (rw_, rw) ][4]
-                free_edges.remove( (rw_, rw) )
-
-
-                w = w_
-
-            simple_paths[ (v0, w0, path[-1]) ] = path_length, path_score, path
-            r_path.reverse()
-            assert r_path[0] == reverse_end(path[-1])
-            simple_paths[ (r_path[0], rw0, rv0) ] = r_path_length, r_path_score, r_path
-
-            if DEBUG_LOG_LEVEL > 1:
-                print  path_length, path_score, path
-
-            dual_path[ (r_path[0], rw0, rv0) ] = (v0, w0, path[-1])
-            dual_path[ (v0, w0, path[-1]) ] = (r_path[0], rw0, rv0)
-
-
-
-    ug = nx.MultiDiGraph()
-    u_edge_data = {}
-    circular_path = set()
-
-    for s, v, t in simple_paths:
-        length, score, path = simple_paths[ (s, v, t) ]
-        u_edge_data[ (s, t, v) ] = (length, score, path, "simple")
-        if s != t:
-            ug.add_edge(s, t, key = v, type_ = "simple", via = v, length = length, score = score)
-        else:
-            circular_path.add( (s, t, v) )
-
-
-    if DEBUG_LOG_LEVEL > 1:
-        with open("utg_data0","w") as f:
-            for s, t, v in u_edge_data:
-                rs = reverse_end(t)
-                rt = reverse_end(s)
-                rv = reverse_end(v)
-                assert (rs, rt, rv) in u_edge_data
-                length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
-
-                if type_ == "compound":
-                    path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
-                else:
-                    path_or_edges = "~".join( path_or_edges )
-                print >>f, s, v, t, type_, length, score, path_or_edges
-
-    # identify spurs in the utg graph
-    # currently we use ad-hoc logic to filter out shorter utgs, but we can
-    # add proper alignment comparison later to remove redundant utgs
-
-    utg_spurs = set()
-    all_nodes = ug.nodes()
-
-    ug2 = ug.copy()
-
-    s_candidates = set( )
-    for v in ug2.nodes():
-        if ug2.in_degree(v) == 0:
-            s_candidates.add(v)
-
-    while len(s_candidates) > 0:
-        n = s_candidates.pop()
-        if ug2.in_degree(n) != 0:
-            continue
-        n_ego_graph = nx.ego_graph( ug2, n, radius = 10 )
-        n_ego_node_set = set( n_ego_graph.nodes() )
-        for b_node in n_ego_graph.nodes():
-            if ug2.in_degree(b_node) <= 1:
-                continue
-
-            with_extern_node = False
-            b_in_nodes = [e[0] for e in ug2.in_edges(b_node)]
-
-            if len(b_in_nodes) == 1:
-                continue
-
-            for v in  b_in_nodes:
-                if v not in n_ego_node_set:
-                    with_extern_node = True
-                    break
-
-            if not with_extern_node:
-                continue
-
-            s_path = nx.shortest_path( ug2, n, b_node )
-            v1 = s_path[0]
-            total_length = 0
-            for v2 in s_path[1:]:
-                for s, t, v in ug2.out_edges(v1, keys=True):
-                    if t != v2:
-                        continue
-                    length, score, edges, type_ = u_edge_data[ (s, t, v) ]
-                    total_length += length
-                v1 = v2
-
-            if total_length >= 50000:
-                continue
-
-            v1 = s_path[0]
-            for v2 in s_path[1:]:
-                for s, t, v in ug2.out_edges(v1, keys=True):
-                    if t != v2:
-                        continue
-                    length, score, edges, type_ = u_edge_data[ (s, t, v) ]
-                    rs = reverse_end(t)
-                    rt = reverse_end(s)
-                    rv = reverse_end(v)
-                    try:
-                        ug2.remove_edge( s, t, key= v)
-                        ug2.remove_edge( rs, rt, key= rv)
-                        u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
-                        u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
-                    except Exception:
-                        pass
-
-                if ug2.in_degree(v2) == 0:
-                    s_candidates.add(v2)
-                v1 = v2
-            break
-
-    #phase 2, finding all "consistent" compound paths
-    compound_paths = construct_compound_paths(ug2, u_edge_data)
-    compound_path_file = open("c_path","w")
-
-    ug2_edges = set(ug2.edges(keys = True))
-    edges_to_remove  = set()
-    for s, v, t in compound_paths:
-        width, length, score, bundle_edges =  compound_paths[ (s, v, t) ]
-        print >> compound_path_file, s,v,t, width, length, score, "|".join( [e[0]+"~"+e[2]+"~"+e[1] for e in bundle_edges] )
-        for ss, tt, vv in bundle_edges:
-            if (ss, tt, vv) in ug2_edges:
-                edges_to_remove.add( (ss, tt, vv) )
-
-
-    for s, t, v in edges_to_remove:
-        ug2.remove_edge( s, t ,v )
-        length, score, edges, type_ = u_edge_data[ (s, t, v) ]
-        if type_ != "spur":
-            u_edge_data[ (s, t, v) ] = length, score, edges, "contained"
-
-
-    for s, v, t in compound_paths:
-        width, length, score, bundle_edges =  compound_paths[ (s, v, t) ]
-        u_edge_data[ (s, t, v) ] = (length, score, bundle_edges, "compound")
-        ug2.add_edge( s, t, key = v, via = v, type_="compound", length = length, score = score)
-
-        assert v == "NA"
-        rs = reverse_end(t)
-        rt = reverse_end(s)
-        assert (rs, v, rt) in compound_paths
-        dual_path[ (s, v, t) ] = (rs, v, rt)
-        dual_path[ (rs, v, rt) ] = (s, v, t)
-
-    compound_path_file.close()
-
-
-    # remove short utg using local flow consistent rule
-    """
-      short UTGs like this can be removed; this kind of utg is likely an artifact of repeats
-      >____           _____>
-           \__UTG_>__/
-      <____/         \_____<
-    """
-    ug_edge_to_remove = set()
-    for s, t, v in ug.edges(keys=True):
-        if ug2.in_degree(s) == 1 and ug2.out_degree(s) == 2 and \
-           ug2.in_degree(t) == 2 and ug2.out_degree(t) == 1:
-            length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
-            if length < 60000:
-                rs = reverse_end(t)
-                rt = reverse_end(s)
-                rv = reverse_end(v)
-                ug_edge_to_remove.add( (s, t, v) )
-                ug_edge_to_remove.add( (rs, rt, rv) )
-    for s, t, v in list(ug_edge_to_remove):
-        ug2.remove_edge(s, t, key=v)
-        length, score, edges, type_ = u_edge_data[ (s, t, v) ]
-        u_edge_data[ (s, t, v) ] = length, score, edges, "repeat_bridge"
-
-    ug = ug2
-
-    with open("utg_data","w") as f:
-        for s, t, v in u_edge_data:
-            length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
-
-            if v == "NA":
-                path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
-            else:
-                path_or_edges = "~".join( path_or_edges )
-            print >>f, s, v, t, type_, length, score, path_or_edges
-
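Each `utg_data` record above is whitespace-separated: source node, via node, target node, unitig type, length, score, and the path, which is a `~`-joined node list for simple unitigs or `|`-joined `s~v~t` edge triples when the via node is `NA` (compound). Two illustrative lines (the ids and numbers below are made up, and the `read_id:end` node spelling is an assumption about the graph's node labels, not something this hunk shows):

    000123:B 000456:E 000789:B simple 23450 -45210 000123:B~000456:E~000789:B
    000789:B NA 000999:E compound 51200 -98000 000789:B~000900:B~000999:E|000789:B~000901:E~000999:E
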
-    # contig construction from utgs
-
-    s_nodes = set()
-    t_nodes = set()
-    simple_nodes = set()
-    simple_out = set()
-    simple_in = set()
-
-    all_nodes = ug.nodes()
-    for n in all_nodes:
-        in_degree = len( ug.in_edges(n) )
-        out_degree = len( ug.out_edges(n) )
-        if in_degree == 1 and out_degree == 1:
-            simple_nodes.add(n)
-        else:
-            if out_degree != 0:
-                s_nodes.add(n)
-            if in_degree != 0:
-                t_nodes.add(n)
-        if out_degree == 1:
-            simple_out.add(n)
-        if in_degree == 1:
-            simple_in.add(n)
-
-    all_nodes = set(all_nodes)
-    c_path = []
-
-    free_edges = set()
-    for s, t, v in ug.edges(keys=True):
-        free_edges.add( (s, t, v) )
-
-    while len(free_edges) != 0:
-
-        if len(s_nodes) != 0:
-            n = s_nodes.pop()
-        else:
-            e = free_edges.pop()
-            n = e[0]
-
-        for s, t, v in ug.out_edges(n, keys=True):
-            path_start = n
-            path_end = None
-            path_key = None
-            path = []
-            path_length = 0
-            path_score = 0
-            path_nodes = set()
-            path_nodes.add(s)
-            if DEBUG_LOG_LEVEL > 1:
-                print "check 1", s, t, v
-            path_key = t
-            t0 = s
-            while t in simple_out:
-                if t in path_nodes:
-                    break
-                rt = reverse_end(t)
-                if rt in path_nodes:
-                    break
-
-                path.append( (t0, t, v) )
-                path_nodes.add(t)
-                length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
-                path_length += length
-                path_score += score
-                assert len( ug.out_edges( t, keys=True ) ) == 1 # t is "simple_out" node
-                t0, t, v = ug.out_edges( t, keys=True )[0]
-
-            path.append( (t0, t, v) )
-            length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
-            path_length += length
-            path_score += score
-            path_nodes.add(t)
-            path_end = t
-
-            c_path.append( (path_start, path_key, path_end, path_length, path_score, path, len(path)) )
-            if DEBUG_LOG_LEVEL > 1:
-                print "c_path", path_start, path_key, path_end, path_length, path_score, len(path)
-            for e in path:
-                if e in free_edges:
-                    free_edges.remove( e )
-
-    if DEBUG_LOG_LEVEL > 1:
-        print "left over edges:", len(free_edges)
-
-
-
-    free_edges = set()
-    for s, t, v in ug.edges(keys=True):
-        free_edges.add( (s, t, v) )
-
-
-    ctg_id = 0
-
-    ctg_paths = open("ctg_paths","w")
-
-    c_path.sort( key=lambda x: -x[3] )
-
-
-    for path_start, path_key, path_end, p_len, p_score, path, n_edges in c_path:
-        length = 0
-        score = 0
-        length_r = 0
-        score_r = 0
-
-        non_overlapped_path = []
-        non_overlapped_path_r = []
-        for s, t, v in path:
-            if v != "NA":
-                rs, rt, rv = reverse_end(t), reverse_end(s), reverse_end(v)
-            else:
-                rs, rt, rv = reverse_end(t), reverse_end(s), "NA"
-            if (s, t, v) in free_edges and (rs, rt, rv) in free_edges:
-                non_overlapped_path.append( (s,t,v) )
-                non_overlapped_path_r.append( (rs, rt, rv)  )
-                length += u_edge_data[ (s, t, v) ][0]
-                score += u_edge_data[ (s, t, v) ][1]
-                length_r += u_edge_data[ (rs, rt, rv) ][0]
-                score_r += u_edge_data[ (rs, rt, rv) ][1]
-            else:
-                break
-
-        if len(non_overlapped_path) == 0:
-            continue
-        s0, t0, v0 = non_overlapped_path[0]
-        end_node = non_overlapped_path[-1][1]
-
-        print >> ctg_paths, "%06dF" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length, score, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path ] )
-        non_overlapped_path_r.reverse()
-        s0, t0, v0 = non_overlapped_path_r[0]
-        end_node = non_overlapped_path_r[-1][1]
-        print >> ctg_paths, "%06dR" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length_r, score_r, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path_r ] )
-        ctg_id += 1
-        for e in non_overlapped_path:
-            if e in free_edges:
-                free_edges.remove(e)
-        for e in non_overlapped_path_r:
-            if e in free_edges:
-                free_edges.remove(e)
-
-
-
-    for s, t, v in list(circular_path):
-        length, score, path, type_ = u_edge_data[ (s, t, v) ]
-        print >> ctg_paths, "%6d" % ctg_id, "ctg_circular", s+"~"+v+"~"+t, t, length, score, s+"~"+v+"~"+t
-        ctg_id += 1
-
-    ctg_paths.close()
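All of the graph code above leans on a `reverse_end()` helper defined elsewhere in this module. For orientation, here is a minimal sketch of what it is assumed to do, namely map a node to the same read seen from the opposite end; the `read_id:B`/`read_id:E` label format is an assumption, not something this hunk shows:

    def reverse_end(node_id):
        # A string-graph node names one end of a read; its dual is the
        # other end of the same read.
        rid, end = node_id.split(':')
        return rid + ':' + ('E' if end == 'B' else 'B')

    # reverse_end('000000123:B')  ->  '000000123:E'
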
diff --git a/FALCON_unzip/falcon_unzip/phasing.py b/FALCON_unzip/falcon_unzip/phasing.py
deleted file mode 100644
index 6bd3151..0000000
--- a/FALCON_unzip/falcon_unzip/phasing.py
+++ /dev/null
@@ -1,570 +0,0 @@
-from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
-        makePypeLocalFile, fn, PypeTask)
-from falcon_kit.FastaReader import FastaReader
-import argparse
-import logging
-import os
-import re
-import shlex
-import subprocess
-import sys
-
-cigar_re = r"(\d+)([MIDNSHP=X])"
-
-def make_het_call(self):
-
-    bam_fn = fn(self.bam_file)
-    ctg_id = self.parameters["ctg_id"]
-    ref_seq = self.parameters["ref_seq"]
-    base_dir = self.parameters["base_dir"]
-    vmap_fn = fn(self.vmap_file)
-    vpos_fn = fn(self.vpos_file)
-    q_id_map_fn = fn(self.q_id_map_file)
-
-    p = subprocess.Popen(shlex.split("samtools view %s %s" % (bam_fn, ctg_id) ), stdout=subprocess.PIPE)
-    pileup = {}
-    q_id_map = {}
-    q_max_id = 0
-    q_id = 0
-    q_name_to_id = {}
-
-    try:
-        os.makedirs("%s/%s" % (base_dir, ctg_id))
-    except OSError:
-        pass
-
-    vmap = open(vmap_fn, "w")
-    vpos = open(vpos_fn, "w")
-
-    for l in p.stdout:
-        l = l.strip().split()
-        if l[0][0] == "@":
-            continue
-
-        QNAME = l[0]
-        if QNAME not in q_name_to_id:
-            q_id = q_max_id
-            q_name_to_id[QNAME] = q_id
-            q_max_id += 1
-
-        q_id = q_name_to_id[QNAME]
-        q_id_map[q_id] = QNAME
-        FLAG = int(l[1])
-        RNAME = l[2]
-        POS = int(l[3]) - 1 # convert to zero base
-        CIGAR = l[5]
-        SEQ = l[9]
-        rp = POS
-        qp = 0
-
-        skip_base = 0
-        total_aln_pos = 0
-        for m in re.finditer(cigar_re, CIGAR):
-            adv = int(m.group(1))
-            total_aln_pos += adv
-
-            if m.group(2)  == "S":
-                skip_base += adv
-
-        if 1.0 - 1.0 * skip_base / total_aln_pos < 0.1:
-            continue
-        if total_aln_pos < 2000:
-            continue
-
-        for m in re.finditer(cigar_re, CIGAR):
-            adv = int(m.group(1))
-            if m.group(2) == "S":
-                qp += adv
-            if m.group(2) == "M":
-                matches = []
-                for i in range(adv):
-                    matches.append( (rp, SEQ[qp]) )
-                    rp += 1
-                    qp += 1
-                matches = matches[1:-1]
-                for pos, b in  matches:
-                    pileup.setdefault(pos, {})
-                    pileup[pos].setdefault(b, [])
-                    pileup[pos][b].append(q_id)
-            elif m.group(2) == "I":
-                for i in range(adv):
-                    qp += 1
-            elif m.group(2) == "D":
-                for i in range(adv):
-                    rp += 1
-
-        pos_k = pileup.keys()
-        pos_k.sort()
-        th = 0.25
-        for pos in pos_k:
-            if pos < POS:
-                if len(pileup[pos]) < 2:
-                    del pileup[pos]
-                    continue
-                base_count = []
-                total_count = 0
-                for b in ["A", "C", "G", "T"]:
-                    count = len(pileup[pos].get(b,[]))
-                    base_count.append( (count, b) )
-                    total_count += count
-                if total_count < 10:
-                    del pileup[pos]
-                    continue
-
-                base_count.sort()
-                base_count.reverse()
-                p0 = 1.0 *  base_count[0][0] / total_count
-                p1 = 1.0 *  base_count[1][0] / total_count
-                if p0 < 1.0 - th and p1 > th:
-                    b0 = base_count[0][1]
-                    b1 = base_count[1][1]
-                    ref_base = ref_seq[pos]
-                    print >> vpos, pos+1, ref_base, total_count, " ".join(["%s %d" % (x[1], x[0]) for x in base_count])
-                    for q_id_ in pileup[pos][b0]:
-                        print >> vmap, pos+1, ref_base, b0, q_id_
-                    for q_id_ in pileup[pos][b1]:
-                        print >> vmap, pos+1, ref_base, b1, q_id_
-                del pileup[pos]
-
-
-    q_id_map_f = open(q_id_map_fn, "w")
-    for q_id, q_name in q_id_map.items():
-        print >> q_id_map_f, q_id, q_name
-    q_id_map_f.close()
-    vmap.close()
-    vpos.close()
-
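For reference, the CIGAR walk in `make_het_call` advances the reference pointer on `M`/`D` operations and the query pointer on `S`/`M`/`I`. A self-contained illustration of the same pointer arithmetic on a toy CIGAR string (this sketch also handles `=`, `X`, and `N` in the standard way, which the function above does not need):

    import re

    cigar_re = r"(\d+)([MIDNSHP=X])"

    def walk_cigar(cigar, rp=0, qp=0):
        # Returns the final (reference, query) positions after consuming
        # the CIGAR, using the same advance rules as make_het_call.
        for m in re.finditer(cigar_re, cigar):
            adv, op = int(m.group(1)), m.group(2)
            if op in ('M', '=', 'X'):
                rp += adv
                qp += adv
            elif op in ('I', 'S'):
                qp += adv
            elif op in ('D', 'N'):
                rp += adv
        return rp, qp

    # walk_cigar('10S50M2D30M')  ->  (82, 90)
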
-
-def generate_association_table(self):
-
-    vmap_fn = fn(self.vmap_file)
-    atable_fn = fn(self.atable_file)
-    ctg_id = self.parameters["ctg_id"]
-    base_dir = self.parameters["base_dir"]
-
-    vmap = {}
-    v_positions = []
-
-    with open(vmap_fn) as f:
-        for l in f:
-            l = l.strip().split()
-            pos = int(l[0])
-            ref_b = l[1]
-            v_b = l[2]
-            q_id = int(l[3])
-            if (pos, ref_b) not in vmap:
-                v_positions.append( (pos, ref_b) )
-            vmap.setdefault( (pos, ref_b), {} )
-            vmap[ (pos, ref_b) ].setdefault(v_b, [])
-            vmap[ (pos, ref_b) ][v_b].append( q_id )
-
-
-    #xary = []
-    #yary = []
-    with open(atable_fn, "w") as out_f:
-        for i1 in xrange(len(v_positions)):
-            link_count = 0
-            for i2 in xrange(i1+1, len(v_positions)):
-                pos1, rb1 = v_positions[i1]
-                pos2, rb2 = v_positions[i2]
-                if pos2 - pos1 > (1 << 16):
-                    continue
-                ct = {}
-                p1table = []
-                p2table = []
-                s1 = 0
-                list1 = vmap[ (pos1, rb1) ].items()
-                for b1, qids1 in list1:
-                    p1table.append( (b1, len(qids1) ) )
-                    s1 += len(qids1)
-
-                s2 = 0
-                list2 = vmap[ (pos2, rb2) ].items()
-                for b2, qids2 in list2:
-                    p2table.append( (b2, len(qids2) ) )
-                    s2 += len(qids2)
-
-                total_s = 0
-                for b1, qids1 in list1:
-                    for b2, qids2 in list2:
-                        s = len(set(qids1) & set(qids2))
-                        ct[(b1,b2)] = s
-                        total_s += s
-                if total_s < 6:
-                    continue
-
-                b11 = p1table[0][0]
-                b12 = p1table[1][0]
-                b21 = p2table[0][0]
-                b22 = p2table[1][0]
-                print >> out_f, pos1, b11, b12, pos2, b21, b22, ct[(b11,b21)], ct[(b11,b22)], ct[(b12,b21)], ct[(b12,b22)]
-
-
-                #xary.append(pos1)
-                #yary.append(pos2)
-                link_count += 1
-                if link_count > 500:
-                    break
-
-def get_score( c_score, pos1, pos2, s1, s2 ):
-    if pos1 > pos2:
-        pos1, pos2 = pos2, pos1
-        s1, s2 = s2, s1
-    b11, b12 = s1
-    b21, b22 = s2
-    return c_score[ (pos1, pos2) ][ (b11+b21, b12+b22) ]
-
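A phasing state at a position is an ordered base pair such as `('A', 'C')`: haplotype 0 carries the first base, haplotype 1 the second. `c_score` (filled in by `get_phased_blocks` below) is keyed by the concatenation of same-haplotype bases at the two positions, so `get_score` is invariant under re-ordering the positions. A toy example of the convention (the counts are made up):

    c_score = {(100, 200): {('AG', 'CT'): 30, ('CT', 'AG'): 30,
                            ('AT', 'CG'): 4,  ('CG', 'AT'): 4}}

    get_score(c_score, 100, 200, ('A', 'C'), ('G', 'T'))   # -> 30, in-phase support
    get_score(c_score, 200, 100, ('G', 'T'), ('A', 'C'))   # -> 30, positions re-ordered
    get_score(c_score, 100, 200, ('C', 'A'), ('G', 'T'))   # -> 4, out-of-phase support
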
-def get_phased_blocks(self):
-    vmap_fn = fn(self.vmap_file)
-    atable_fn = fn(self.atable_file)
-    p_variant_fn = fn(self.phased_variant_file)
-
-    left_connect = {}
-    right_connect = {}
-
-    c_score = {}
-    states = {}
-    positions = set()
-
-
-
-    ref_base = {}
-    with open(vmap_fn) as f:
-        for l in f:
-            l = l.strip().split()
-            pos = int(l[0])
-            ref_b = l[1]
-            v_b = l[2]
-            q_id = int(l[3])
-            ref_base[pos] = ref_b
-
-    with open(atable_fn) as f:
-        for l in f:
-            l = l.strip().split()
-            pos1, b11, b12, pos2, b21, b22, s11, s12, s21, s22 = l
-            s11, s12, s21, s22 = int(s11), int(s12), int(s21), int(s22)
-            if abs(s11+s22-s12-s21) < 6:
-                continue
-            pos1 = int(pos1)
-            pos2 = int(pos2)
-            positions.add(pos1)
-            positions.add(pos2)
-            right_connect.setdefault(pos1, [])
-            right_connect[pos1].append(pos2)
-            left_connect.setdefault(pos2, [])
-            left_connect[pos2].append(pos1)
-            c_score[ (pos1, pos2) ] = { (b11+b21, b12+b22): s11 + s22, (b12+b22, b11+b21): s11 + s22,
-                                        (b12+b21, b11+b22): s12 + s21, (b11+b22, b12+b21): s12 + s21 }
-
-
-            if pos1 not in states:
-                st1 = (b11, b12)
-                st2 = (b12, b11)
-                score1 = 0
-                score2 = 0
-                for pp in left_connect.get(pos1,[]):
-                    if pp in states:
-                        st0 = states[pp]
-                    else:
-                        continue
-                    score1 += get_score( c_score, pp, pos1, st0, st1 )
-                    score2 += get_score( c_score, pp, pos1, st0, st2 )
-
-                for pp in right_connect.get(pos1,[]):
-                    if pp in states:
-                        st0 = states[pp]
-                    else:
-                        continue
-                    score1 += get_score( c_score, pos1, pp, st1, st0 )
-                    score2 += get_score( c_score, pos1, pp, st2, st0 )
-
-                if score1 >= score2:
-                    states[pos1] = st1
-                else:
-                    states[pos1] = st2
-
-            if pos2 not in states:
-                st1 = (b21, b22)
-                st2 = (b22, b21)
-                score1 = 0
-                score2 = 0
-                for pp in left_connect.get(pos2,[]):
-                    if pp in states:
-                        st0 = states[pp]
-                    else:
-                        continue
-                    score1 += get_score( c_score, pp, pos2, st0, st1 )
-                    score2 += get_score( c_score, pp, pos2, st0, st2 )
-
-                for pp in right_connect.get(pos2,[]):
-                    if pp in states:
-                        st0 = states[pp]
-                    else:
-                        continue
-                    score1 += get_score( c_score, pos2, pp, st1, st0 )
-                    score2 += get_score( c_score, pos2, pp, st2, st0 )
-
-                if score1 >= score2:
-                    states[pos2] = st1
-                else:
-                    states[pos2] = st2
-
-    positions = list(positions)
-    positions.sort()
-
-
-    iter_count = 0
-    while 1:
-        iter_count += 1
-        if iter_count > 10:
-            break
-        update_count = 0
-        for p in positions:
-            b1, b2 = states[p]
-            st1 = (b1, b2)
-            st2 = (b2, b1)
-
-            score1 = 0
-            score2 = 0
-            for pp in left_connect.get(p,[]):
-                st0 = states[pp]
-                score1 += get_score( c_score, pp, p, st0 ,st1)
-                score2 += get_score( c_score, pp, p, st0, st2)
-
-            #for pp in right_connect.get(p,[]):
-            #    st0 = states[pp]
-            #    score1 += get_score( c_score, p, pp, st1 ,st0)
-            #    score2 += get_score( c_score, p, pp, st2, st0)
-
-            if score1 >= score2:
-                states[p] = st1
-            else:
-                states[p] = st2
-                update_count += 1
-        if update_count == 0:
-            break
-
-
-    right_extent = {}
-    right_score = {}
-    left_extent = {}
-    left_score = {}
-
-
-    for p in positions:
-
-        left_extent[p] = p
-        left_score[p] = 0
-        if p in left_connect:
-            left = p
-            st0 = states[p]
-            st0_ = st0[1], st0[0]
-            for pp in left_connect[p]:
-                st1 = states[pp]
-                s = get_score( c_score, pp, p, st1, st0)
-                s_ = get_score( c_score, pp, p, st1, st0_)
-                left_score[p] += s - s_
-                if s - s_ > 0 and pp < left:
-                    left = pp
-            left_extent[p] = left
-
-        right_extent[p] = p
-        right_score[p] = 0
-        if p in right_connect:
-            right = p
-            st0 = states[p]
-            st0_ = st0[1], st0[0]
-            for pp in right_connect[p]:
-                st1 = states[pp]
-                s = get_score( c_score, p, pp, st0, st1)
-                s_ = get_score( c_score, p, pp, st0_, st1)
-                right_score[p] += s - s_
-                if s - s_ > 0 and pp > right:
-                    right = pp
-            right_extent[p] = right
-
-
-
-
-    phase_block_id = 1
-    phase_blocks = {}
-    pb = []
-
-    max_right_ext = 0
-    for p in positions:
-        if right_score[p] < 10 or left_score[p] < 10:
-            continue
-        b1, b2 = states[p]
-        if max_right_ext < left_extent[p]:
-            if len(pb) > 3:
-                phase_blocks[phase_block_id] = pb
-                phase_block_id += 1
-            pb = []
-        pb.append( (p, b1, b2) )
-        if right_extent[p] > max_right_ext:
-            max_right_ext =  right_extent[p]
-    if len(pb) > 3:
-        phase_blocks[phase_block_id] = pb
-    else:
-        phase_block_id -= 1
-
-
-    with open(p_variant_fn, "w") as out_f:
-        for pid in xrange(1, phase_block_id+1):
-            if len(phase_blocks[pid]) == 0:
-                continue
-            min_ = min( [x[0] for x in phase_blocks[pid]] )
-            max_ = max( [x[0] for x in phase_blocks[pid]] )
-
-            print >>out_f, "P", pid, min_, max_, max_ - min_, len(phase_blocks[pid]), 1.0 * (max_-min_)/len(phase_blocks[pid])
-            for p, b1, b2 in phase_blocks[pid]:
-                rb = ref_base[p]
-                print >>out_f, "V", pid, p, "%d_%s_%s" % (p,rb,b1), "%d_%s_%s" % (p,rb,b2), left_extent[p], right_extent[p], left_score[p], right_score[p]
-
-def get_phased_reads(self):
-
-    q_id_map_fn = fn(self.q_id_map_file)
-    vmap_fn = fn(self.vmap_file)
-    p_variant_fn = fn(self.phased_variant_file)
-    parameters = self.parameters
-
-    ctg_id = parameters["ctg_id"]
-
-    phased_read_fn = fn(self.phased_read_file)
-
-    rid_map = {}
-    with open(q_id_map_fn) as f:
-        for l in f:
-            l = l.strip().split()
-            rid_map[int(l[0])] = l[1]
-
-
-    read_to_variants = {}
-    variant_to_reads = {}
-    with open(vmap_fn) as f:
-        for l in f:
-            l = l.strip().split()
-            variant = "_".join(l[:3])
-            read_id = int(l[3])
-            read_to_variants.setdefault(read_id, set())
-            read_to_variants[read_id].add(variant)
-            variant_to_reads.setdefault(variant, set())
-            variant_to_reads[variant].add(read_id)
-
-
-    variant_to_phase = {}
-    with open(p_variant_fn) as f:
-        for l in f:
-            """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
-            l = l.strip().split()
-            if l[0] != "V":
-                continue
-            pb_id = int(l[1])
-            variant_to_phase[ l[3] ] = (pb_id, 0)
-            variant_to_phase[ l[4] ] = (pb_id, 1)
-
-    with open(phased_read_fn, "w") as out_f:
-        for r in read_to_variants:
-            vl = {}
-            pl = set()
-            for v in list( read_to_variants[r] ):
-                if v in variant_to_phase:
-                    p = variant_to_phase[v]
-                    vl[ p ] = vl.get(p, 0) + 1
-                    pl.add(p[0])
-            pl = list(pl)
-            pl.sort()
-            for p in pl:
-                if vl.get( (p,0), 0) - vl.get( (p,1), 0) > 1:
-                    print >> out_f, r, ctg_id, p, 0, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
-                elif vl.get( (p,1), 0) - vl.get( (p,0), 0) > 1:
-                    print >> out_f, r, ctg_id, p, 1, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
-
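The assignment rule in `get_phased_reads` is a majority vote with a margin: a read is assigned to phase 0 (or 1) of a block only if it carries at least two more variants from that phase than from the other. A small worked example of the counting:

    # vl maps (phase_block, phase) -> number of phased variants the read carries
    vl = {(7, 0): 5, (7, 1): 1}
    vl.get((7, 0), 0) - vl.get((7, 1), 0)   # -> 4 > 1, so the read is assigned phase 0

    vl = {(7, 0): 3, (7, 1): 2}             # margin is only 1
    # neither difference exceeds 1, so the read is left unassigned for block 7
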
-def phasing(args):
-    bam_fn = args.bam
-    fasta_fn = args.fasta
-    ctg_id = args.ctg_id
-    base_dir = args.base_dir
-
-    ref_seq = ""
-    for r in FastaReader(fasta_fn):
-        rid = r.name.split()[0]
-        if rid != ctg_id:
-            continue
-        ref_seq = r.sequence.upper()
-
-    wf = PypeProcWatcherWorkflow(
-            max_jobs=1,
-    )
-
-    bam_file = makePypeLocalFile(bam_fn)
-    vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_map") )
-    vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_pos") )
-    q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "q_id_map") )
-    parameters = {}
-    parameters["ctg_id"] = ctg_id
-    parameters["ref_seq"] = ref_seq
-    parameters["base_dir"] = base_dir
-
-    make_het_call_task = PypeTask( inputs = { "bam_file": bam_file },
-                         outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file },
-                         parameters = parameters,
-    ) (make_het_call)
-
-    wf.addTasks([make_het_call_task])
-
-
-
-
-    atable_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'g_atable', "atable") )
-    parameters = {}
-    parameters["ctg_id"] = ctg_id
-    parameters["base_dir"] = base_dir
-    generate_association_table_task = PypeTask( inputs = { "vmap_file": vmap_file },
-                                      outputs = { "atable_file": atable_file },
-                                      parameters = parameters,
-    ) (generate_association_table)
-
-    wf.addTasks([generate_association_table_task])
-
-
-
-
-    phased_variant_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'get_phased_blocks', "phased_variants") )
-    get_phased_blocks_task = PypeTask( inputs = { "vmap_file": vmap_file, "atable_file": atable_file },
-                                      outputs = { "phased_variant_file": phased_variant_file },
-    ) (get_phased_blocks)
-    wf.addTasks([get_phased_blocks_task])
-
-
-
-
-    phased_read_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "phased_reads") )
-    get_phased_reads_task = PypeTask( inputs = { "vmap_file": vmap_file,
-                                                 "q_id_map_file": q_id_map_file,
-                                                 "phased_variant_file": phased_variant_file },
-                                      outputs = { "phased_read_file": phased_read_file },
-                                      parameters = {"ctg_id": ctg_id},
-    ) (get_phased_reads)
-    wf.addTasks([get_phased_reads_task])
-
-
-    wf.refreshTargets()
-    #with open("fc_phasing_wf.dot", "w") as f:
-    #    print >>f, wf.graphvizDot
-
-def parse_args(argv):
-    parser = argparse.ArgumentParser(description='phasing variants and reads from a bam file')
-    # we can run this in parallel mode in the future
-    #parser.add_argument('--n_core', type=int, default=4,
-    #                    help='number of processes used for generating consensus')
-    parser.add_argument('--bam', type=str, help='path to sorted bam file', required=True)
-    parser.add_argument('--fasta', type=str, help='path to the fasta file containing the contig', required=True)
-    parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
-    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir; defaults to the current working directory')
-
-
-    args = parser.parse_args(argv[1:])
-    return args
-
-def main(argv=sys.argv):
-    logging.basicConfig()
-    args = parse_args(argv)
-    phasing(args)
diff --git a/FALCON_unzip/falcon_unzip/phasing_readmap.py b/FALCON_unzip/falcon_unzip/phasing_readmap.py
deleted file mode 100644
index 2e30c4b..0000000
--- a/FALCON_unzip/falcon_unzip/phasing_readmap.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import argparse
-import logging
-import os
-import re
-import sys
-
-
-def get_phasing_readmap(args):
-
-    phased_reads = args.phased_reads
-    read_map_dir = args.read_map_dir
-    the_ctg_id = args.ctg_id
-    base_dir = args.base_dir
-
-    rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids')
-    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')
-    rid_to_oid = open(rawread_id_file).read().split('\n')  #daligner raw read id to the original ids
-    pid_to_fid = open(pread_id_file).read().split('\n')  #daligner pread id to the fake ids
-
-    def pid_to_oid(pid):
-        fid = pid_to_fid[int(pid)]
-        rid = int(fid.split('/')[1])/10
-        return rid_to_oid[int(rid)]
-
-
-
-    rid_to_phase = {}
-    with open(phased_reads) as f:
-        for row in f:
-            row = row.strip().split()
-            rid_to_phase[row[6]] = (int(row[2]), int(row[3]))
-
-    arid_to_phase = {}
-    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
-    with open(map_fn) as f:
-        for row in f:
-            row = row.strip().split()
-            ctg_id = row[1]
-            if not ctg_id.startswith(the_ctg_id):
-                continue
-            if int(row[3]) != 0: #not the best hit
-                continue
-            o_id = pid_to_oid(row[0])
-            phase = rid_to_phase.get( o_id, (-1, 0) )
-            arid_to_phase['%09d' % int(row[0])] = phase
-
-    with open(os.path.join(base_dir, 'rid_to_phase.%s' % the_ctg_id),'w') as f:
-        for arid, phase in arid_to_phase.items():
-            print >>f, arid, the_ctg_id, phase[0], phase[1]
-
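`pid_to_oid()` decodes two levels of indirection: a pread id indexes into the 'fake' ids, whose second `/`-separated field is assumed to be 10x the raw-read id (the factor of 10 is inferred from the division above, and the exact fake-id spelling below is illustrative), and the raw-read id then indexes the original names:

    rid_to_oid = ['read_A', 'read_B', 'read_C']           # raw-read id -> original name
    pid_to_fid = ['000000/0/0_100', '000001/20/0_250']    # pread id -> fake id

    fid = pid_to_fid[1]                 # '000001/20/0_250'
    rid = int(fid.split('/')[1]) / 10   # 20 / 10 -> 2  (integer division)
    rid_to_oid[int(rid)]                # -> 'read_C'
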
-
-
-def parse_args(argv):
-    parser = argparse.ArgumentParser(description='mapping internal daligner read id to phase block and phase',
-            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    # we can run this in parallel mode in the future
-    #parser.add_argument('--n_core', type=int, default=4,
-    #                    help='number of processes used for generating consensus')
-    parser.add_argument('--phased_reads', type=str, help='path to read vs. phase map', required=True)
-    parser.add_argument('--read_map_dir', type=str, help='path to the read map directory', required=True)
-    parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
-    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir; defaults to the current working directory')
-
-    args = parser.parse_args(argv[1:])
-    return args
-
-
-def main(argv=sys.argv):
-    logging.basicConfig()
-    args = parse_args(argv)
-    get_phasing_readmap(args)
diff --git a/FALCON_unzip/falcon_unzip/rr_hctg_track.py b/FALCON_unzip/falcon_unzip/rr_hctg_track.py
deleted file mode 100644
index fa02434..0000000
--- a/FALCON_unzip/falcon_unzip/rr_hctg_track.py
+++ /dev/null
@@ -1,201 +0,0 @@
-from __future__ import print_function
-from falcon_kit.multiproc import Pool
-import falcon_kit.util.io as io
-import argparse
-import sys
-import glob
-import os
-from heapq import heappush, heappop, heappushpop
-
-Reader = io.CapturedProcessReaderContext
-
-
-def get_rid_to_ctg(fn):
-    rid_to_ctg = {}
-    with open(fn) as f:
-        for row in f:
-            row = row.strip().split()
-            pid, rid, oid, ctg = row
-            rid_to_ctg.setdefault( rid, set()  )
-            rid_to_ctg[ rid ].add(ctg)
-    return rid_to_ctg
-
-def run_tr_stage1(db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase):
-    cmd = 'LA4Falcon -m %s %s' % (db_fn, fn)
-    reader = Reader(cmd)
-    with reader:
-        return fn, tr_stage1(reader.readlines, min_len, bestn, rid_to_ctg, rid_to_phase)
-
-def tr_stage1(readlines, min_len, bestn, rid_to_ctg, rid_to_phase):
-    """
-    for each read in the b-read column inside the LAS files, we
-    keep top `bestn` hits with a priority queue through all overlaps
-    """
-
-    rtn = {}
-    for l in readlines():
-        l = l.strip().split()
-        q_id, t_id = l[:2]
-        overlap_len = -int(l[2])
-        idt = float(l[3])
-        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
-        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
-        if t_l < min_len:
-            continue
-        if q_id not in rid_to_ctg:
-            continue
-
-        t_phase = rid_to_phase[ int(t_id) ]
-        if t_phase is not None:
-            ctg_id, block, phase = t_phase
-            if block != -1:
-                q_phase = rid_to_phase[ int(q_id) ]
-                if q_phase is not None:
-                    if q_phase[0] == ctg_id and q_phase[1] == block and q_phase[2] != phase:
-                        continue
-
-        rtn.setdefault(t_id, [])
-        if len(rtn[t_id]) < bestn:
-            heappush(rtn[t_id], (overlap_len, q_id) )
-        else:
-            heappushpop(rtn[t_id], (overlap_len, q_id) )
-
-    return rtn
-
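The `bestn` bookkeeping in `tr_stage1` is the standard fixed-size min-heap idiom: `heappush` until the heap is full, then `heappushpop`, which inserts the new item and evicts the current minimum in one step, so the heap always retains the `bestn` largest keys seen so far. A self-contained sketch of the idiom:

    from heapq import heappush, heappushpop

    def keep_best_n(items, bestn):
        # Retain the `bestn` largest items from a stream,
        # in O(n log bestn) time and O(bestn) memory.
        heap = []
        for item in items:
            if len(heap) < bestn:
                heappush(heap, item)
            else:
                heappushpop(heap, item)   # push item, then pop the minimum
        return sorted(heap, reverse=True)

    # keep_best_n([5, 1, 9, 3, 7], 3)  ->  [9, 7, 5]
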
-def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
-    io.LOG('preparing tr_stage1')
-    io.logstats()
-    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
-    hasm_dir = os.path.abspath(os.path.join(base_dir, '3-unzip'))
-    quiver_dir = os.path.abspath(os.path.join(base_dir, '4-quiver'))
-    rid_to_ctg = get_rid_to_ctg(os.path.join(quiver_dir, 'read_maps', 'read_to_contig_map' ))
-
-    phased_read_file = os.path.join(hasm_dir, 'all_phased_reads')
-    oid_to_phase = {}
-    with open(phased_read_file) as f:
-        for row in f:
-            row = row.strip().split()
-            ctg_id, block, phase = row[1:4]
-            oid = row[6]
-            block = int(block)
-            phase = int(phase)
-            oid_to_phase[ oid ] = (ctg_id, block, phase)
-    rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
-    rid_to_oid = open(rawread_ids_fn).read().split('\n')
-    rid_to_phase = [ None ] * len( rid_to_oid )
-    for rid, oid in enumerate(rid_to_oid):
-        rid_to_phase[rid] = oid_to_phase.get( oid, None )
-
-
-    inputs = []
-    for fn in file_list:
-        inputs.append( (run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase) )
-    """
-    Aggregate hits from each individual LAS and keep the best n hit.
-    Note that this does not guarantee that the final results is globally the best n hits espcially
-    when the number of `bestn` is too small.  In those case, if there is more hits from single LAS
-    file, then we will miss some good  hits.
-    """
-    bread_to_areads = {}
-    for fn, res in exe_pool.imap(io.run_func, inputs):
-        for k in res:
-            bread_to_areads.setdefault(k, [])
-            for item in res[k]:
-                if len(bread_to_areads[k]) < bestn:
-                    heappush( bread_to_areads[k], item )
-                else:
-                    heappushpop( bread_to_areads[k], item )
-
-    #rid_to_oid = open(os.path.join(rawread_dir, 'dump_rawread_ids', 'rawread_ids')).read().split('\n')
-
-    """
-    For each b-read, we find the best contig map throgh the b->a->contig map.
-    """
-    with open(os.path.join(quiver_dir, 'read_maps/rawread_to_contigs'), 'w') as out_f:
-        for bread in bread_to_areads:
-
-            ctg_score = {}
-            for s, rid in bread_to_areads[bread]:
-                if rid not in rid_to_ctg: continue
-
-                ctgs = rid_to_ctg[rid]
-                for ctg in ctgs:
-                    ctg_score.setdefault(ctg, [0,0])
-                    ctg_score[ctg][0] += -s
-                    ctg_score[ctg][1] += 1
-
-            #oid = rid_to_oid[int(bread)]
-            ctg_score = ctg_score.items()
-            ctg_score.sort( key = lambda k: k[1][0] )
-            rank = 0
-
-            for ctg, score_count in ctg_score:
-                if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
-                    in_ctg = 1
-                else:
-                    in_ctg = 0
-                score, count = score_count
-                #print(bread, oid, ctg, count, rank, score, in_ctg, file=out_f)
-                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
-                rank += 1
-
-
-
-def try_run_track_reads(n_core, base_dir, min_len, bestn):
-    io.LOG('starting track_reads')
-
-    rawread_dir = os.path.abspath(os.path.join(base_dir, '0-rawreads'))
-
-    # TODO: better logic for finding the LAS file paths, or move the logic out (e.g. take a --fofn option?)
-    file_list = glob.glob( os.path.join(rawread_dir, 'm*/raw_reads.*.las'))
-    io.LOG('file list: %r' % file_list)
-
-    # likewise, should this be a parameter?
-    db_fn = os.path.join(rawread_dir, 'raw_reads.db')
-    n_core = min(n_core, len(file_list))
-    exe_pool = Pool(n_core)
-    try:
-        run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
-        io.LOG('finished track_reads')
-    except:
-        io.LOG('terminating track_reads workers...')
-        exe_pool.terminate()
-        raise
-
-def track_reads(n_core, base_dir, min_len, bestn, debug, silent, stream):
-    if debug:
-        n_core = 0
-        silent = False
-    if silent:
-        io.LOG = io.write_nothing
-    if stream:
-        global Reader
-        Reader = io.StreamedProcessReaderContext
-
-    try_run_track_reads(n_core, base_dir, min_len, bestn)
-
-def parse_args(argv):
-    parser = argparse.ArgumentParser(description='scan the raw read overlap information to identify the best hits from the reads \
-to the contigs, using the read_to_contig_map generated by `fc_get_read_hctg_map` in `4-quiver/read_maps/read_to_contig_map`. Assumes 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
-            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--n_core', type=int, default=4,
-                        help='number of processes used for for tracking reads; '
-                        '0 for main process only')
-    #parser.add_argument('--fofn', type=str, help='file contains the path of all LAS file to be processed in parallel')
-    #parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
-    parser.add_argument('--base_dir', type=str, default="./", help='the base working dir of a FALCON assembly')
-    parser.add_argument('--min_len', type=int, default=2500, help='min length of the reads')
-    parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
-    parser.add_argument('--debug', '-g', action='store_true', help='single-threaded, plus other aids to debugging')
-    parser.add_argument('--silent', action='store_true', help='suppress cmd reporting on stderr')
-    parser.add_argument('--bestn', type=int, default=40, help='keep best n hits')
-    args = parser.parse_args(argv[1:])
-    return args
-
-def main(argv=sys.argv):
-    args = parse_args(argv)
-    track_reads(**vars(args))
-
-if __name__ == '__main__':
-    main()
diff --git a/FALCON_unzip/falcon_unzip/run_quiver.py b/FALCON_unzip/falcon_unzip/run_quiver.py
deleted file mode 100644
index 4eb20e6..0000000
--- a/FALCON_unzip/falcon_unzip/run_quiver.py
+++ /dev/null
@@ -1,394 +0,0 @@
-from falcon_kit import run_support as support
-from pypeflow.simple_pwatcher_bridge import (
-        PypeLocalFile, makePypeLocalFile, fn,
-        PypeTask,
-        PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
-from falcon_kit.FastaReader import FastaReader
-import glob
-import json
-import logging
-import os
-import pprint
-import re
-import sys
-import time
-import ConfigParser
-
-
-LOG = logging.getLogger(__name__)
-
-def system(call, check=False):
-    LOG.debug('$(%s)' %repr(call))
-    rc = os.system(call)
-    msg = 'Call %r returned %d.' % (call, rc)
-    if rc:
-        LOG.warning(msg)
-        if check:
-            raise Exception(msg)
-    else:
-        LOG.debug(msg)
-    return rc
-
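`system()` wraps `os.system` so that a non-zero exit status is either logged or escalated, depending on `check`. Where no shell features are needed, an equivalent formulation with `subprocess` (shown only for comparison; not part of this module) avoids the shell entirely:

    import subprocess

    def checked_call(args, check=False):
        # args is an argv list such as ['samtools', 'faidx', ref_fasta];
        # no shell is involved, so no quoting issues arise.
        rc = subprocess.call(args)
        if rc and check:
            raise Exception('Call %r returned %d.' % (args, rc))
        return rc
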
-def mkdir(d):
-    if not os.path.isdir(d):
-        os.makedirs(d)
-
-def task_track_reads(self):
-    job_done = fn(self.job_done)
-    wd = self.parameters['wd']
-    config = self.parameters['config']
-    input_bam_fofn = config['input_bam_fofn']
-    sge_track_reads = config['sge_track_reads']
-    script_dir = os.path.join(wd)
-    script_fn = os.path.join(script_dir, 'track_reads_h.sh')
-
-    script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-cd {wd}
-hostname
-date
-fc_get_read_hctg_map.py --basedir ../..
-fc_rr_hctg_track.py --base_dir ../..
-mkdir -p 4-quiver/reads/
-fc_select_reads_from_bam.py --basedir ../.. {input_bam_fofn}
-date
-touch {job_done}
-""".format(**locals())
-
-    with open(script_fn,'w') as script_file:
-        script_file.write(script)
-    self.generated_script_fn = script_fn
-    #job_data['sge_option'] = sge_track_reads
-
-
-def task_run_quiver(self):
-
-    ref_fasta = fn(self.ref_fasta)
-    read_sam = fn(self.read_sam)
-
-    cns_fasta = fn(self.cns_fasta)
-    cns_fastq = fn(self.cns_fastq)
-    job_done = fn(self.job_done)
-
-    job_uid = self.parameters['job_uid']
-    wd = self.parameters['wd']
-    config = self.parameters['config']
-    ctg_id = self.parameters['ctg_id']
-
-    smrt_bin = config['smrt_bin']
-    sge_quiver = config['sge_quiver']
-    job_type = config['job_type']
-    samtools = os.path.join(smrt_bin, 'samtools')
-    pbalign = os.path.join(smrt_bin, 'pbalign')
-    makePbi = os.path.join(smrt_bin, 'makePbi')
-    variantCaller = os.path.join(smrt_bin, 'variantCaller')
-
-    script_dir = os.path.join(wd)
-    script_fn =  os.path.join(script_dir , 'cns_%s.sh' % (ctg_id))
-
-    script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-hostname
-date
-cd {wd}
-
-{samtools} faidx {ref_fasta}
-{samtools} view -b -S {read_sam} > {ctg_id}.bam
-{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
-          --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
-          --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
-            {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam
-#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam
-({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
-            -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
-date
-touch {job_done}
-""".format(**locals())
-
-    with open(script_fn,'w') as script_file:
-        script_file.write(script)
-    self.generated_script_fn = script_fn
-    #job_data['sge_option'] = sge_quiver
-
-def rm(f):
-    system('rm -f {}'.format(f))
-def touch(f):
-    system('touch {}'.format(f))
-def task_cns_zcat(self):
-    gathered_p_ctg_fn = fn(self.gathered_p_ctg)
-    gathered_h_ctg_fn = fn(self.gathered_h_ctg)
-
-    cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
-    cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
-    cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
-    cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
-    job_done_fn = fn(self.job_done)
-
-    rm(cns_p_ctg_fasta_fn)
-    touch(cns_p_ctg_fasta_fn)
-    rm(cns_p_ctg_fastq_fn)
-    touch(cns_p_ctg_fastq_fn)
-    with open(gathered_p_ctg_fn) as ifs:
-        for line in ifs:
-            cns_fasta_fn, cns_fastq_fn = line.split()
-            system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(**locals()))
-            system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(**locals()))
-    with open(gathered_p_ctg_fn) as ifs:
-        for line in ifs:
-            cns_fasta_fn, cns_fastq_fn = line.split()
-            rm(cns_fasta_fn)
-            rm(cns_fastq_fn)
-    rm(cns_h_ctg_fasta_fn)
-    touch(cns_h_ctg_fasta_fn)
-    rm(cns_h_ctg_fastq_fn)
-    touch(cns_h_ctg_fastq_fn)
-    with open(gathered_h_ctg_fn) as ifs:
-        for line in ifs:
-            cns_fasta_fn, cns_fastq_fn = line.split()
-            system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(**locals()))
-            system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(**locals()))
-    with open(gathered_h_ctg_fn) as ifs:
-        for line in ifs:
-            cns_fasta_fn, cns_fastq_fn = line.split()
-            rm(cns_fasta_fn)
-            rm(cns_fastq_fn)
-
-    touch(job_done_fn)
-
-def task_scatter_quiver(self):
-    p_ctg_fn = fn(self.p_ctg_fa)
-    h_ctg_fn = fn(self.h_ctg_fa)
-    out_json = fn(self.scattered_quiver_json)
-
-    ref_seq_data = {}
-
-    # I think this will crash if the file is empty. Maybe that is ok.
-    p_ctg_fa = FastaReader(p_ctg_fn)
-    ctg_types = {}
-    for r in p_ctg_fa:
-        rid = r.name.split()[0]
-        ref_seq_data[rid] = r.sequence
-        ctg_types[rid] = 'p'
-
-
-    # I think this will crash if the file is empty. Maybe that is ok.
-    h_ctg_fa = FastaReader(h_ctg_fn)
-    for r in h_ctg_fa:
-        rid = r.name.split()[0]
-        ref_seq_data[rid] = r.sequence
-        ctg_types[rid] = 'h'
-
-    ctg_ids = sorted(ref_seq_data.keys())
-    #p_ctg_out=[]
-    #h_ctg_out=[]
-    #job_done_plfs = {}
-    jobs = []
-    for ctg_id in ctg_ids:
-        sequence = ref_seq_data[ctg_id]
-        m_ctg_id = ctg_id.split('-')[0]
-        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
-        ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
-        read_sam = os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id))
-        #cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
-        #cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
-        #job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
-
-        if os.path.exists(read_sam): # TODO(CD): Ask Jason what we should do if missing SAM. And what about network latency?
-            #if ctg_types[ctg_id] == 'p':
-            #    p_ctg_out.append( (cns_fasta, cns_fastq) )
-            #if ctg_types[ctg_id] == 'h':
-            #    h_ctg_out.append( (cns_fasta, cns_fastq) )
-            mkdir(wd)
-            if not os.path.exists(fn(ref_fasta)):
-                with open(fn(ref_fasta),'w') as f:
-                    print >>f, '>'+ctg_id
-                    print >>f, sequence
-            #parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
-            #make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam},
-            #                           outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
-            #                           parameters = parameters,
-            #                           )
-            #quiver_task = make_quiver_task(task_run_quiver)
-            #wf.addTask(quiver_task)
-            #job_done_plfs['{}'.format(ctg_id)] = job_done
-            new_job = {}
-            new_job['ctg_id'] = ctg_id
-            new_job['ctg_type'] = ctg_types[ctg_id]
-            jobs.append(new_job)
-    open(out_json, 'w').write(json.dumps(jobs))
-
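The scatter/gather handshake is just a JSON list of per-contig job records; `create_quiver_jobs()` below consumes the `ctg_id` and `ctg_type` fields. The file written above would look like this (the contig ids are illustrative; the hyphenated spelling follows the `split('-')` convention used above for haplotigs):

    [{"ctg_id": "000000F", "ctg_type": "p"},
     {"ctg_id": "000000F-001-01", "ctg_type": "h"}]
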
-def create_quiver_jobs(wf, config, scattered_quiver_plf):
-    scattered_quiver_fn = fn(scattered_quiver_plf)
-    jobs = json.loads(open(scattered_quiver_fn).read())
-    #ctg_ids = sorted(jobs['ref_seq_data'])
-    p_ctg_out=[]
-    h_ctg_out=[]
-    job_done_plfs = {}
-    for job in jobs:
-        ctg_id = job['ctg_id']
-        ctg_type = job['ctg_type']
-        m_ctg_id = ctg_id.split('-')[0]
-        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
-        ref_fasta = makePypeLocalFile(os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id)))
-        read_sam = makePypeLocalFile(os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id)))
-        cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
-        cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
-        job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
-
-        if os.path.exists(fn(read_sam)): # TODO(CD): Ask Jason what we should do if missing SAM.
-            if ctg_type == 'p':
-                p_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
-            elif ctg_type == 'h':
-                h_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
-            else:
-                LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(ctg_type))
-            parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
-            make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam,
-                                         'scattered_quiver': scattered_quiver_plf,
-                                       },
-                                       outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
-                                       parameters = parameters,
-            )
-            quiver_task = make_quiver_task(task_run_quiver)
-            wf.addTask(quiver_task)
-            job_done_plfs['{}'.format(ctg_id)] = job_done
-    #sge_quiver = config['sge_quiver']
-    return p_ctg_out, h_ctg_out, job_done_plfs
-
-def task_gather_quiver(self):
-    """We wrote the "gathered" files during task construction.
-    """
-    job_done_fn = fn(self.job_done)
-    touch(job_done_fn)
-
-
-def main(argv=sys.argv):
-    global LOG
-    LOG = support.setup_logger(None)
-
-
-    if len(argv) < 2:
-        print >>sys.stderr, 'you need to provide a configuration file to specify the cluster running environment'
-        sys.exit(1)
-
-    config_fn = argv[1]
-    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))
-
-    config = ConfigParser.ConfigParser()
-    config.read(config_fn)
-
-
-    job_type = 'SGE'
-    if config.has_option('General', 'job_type'):
-        job_type = config.get('General', 'job_type')
-
-    sge_track_reads = ' -pe smp 12 -q bigmem'
-    if config.has_option('Unzip', 'sge_track_reads'):
-        sge_track_reads = config.get('Unzip', 'sge_track_reads')
-
-    sge_quiver = ' -pe smp 24 -q bigmem '
-    if config.has_option('Unzip', 'sge_quiver'):
-        sge_quiver = config.get('Unzip', 'sge_quiver')
-
-    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
-    if config.has_option('Unzip', 'smrt_bin'):
-        smrt_bin = config.get('Unzip', 'smrt_bin')
-
-    input_bam_fofn = 'input_bam.fofn'
-    if config.has_option('Unzip', 'input_bam_fofn'):
-        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
-    if not os.path.isabs(input_bam_fofn):
-        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)
-
-
-    quiver_concurrent_jobs = 8
-    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
-        quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')
-
-    config = {'job_type': job_type,
-              'sge_quiver': sge_quiver,
-              'sge_track_reads': sge_track_reads,
-              'input_bam_fofn': input_bam_fofn,
-              'smrt_bin': smrt_bin}
-    LOG.info('config={}'.format(pprint.pformat(config)))
-
-    #support.job_type = 'SGE' #tmp hack until we have a configuration parser
-
-
-    wf = PypeProcWatcherWorkflow(
-            max_jobs=quiver_concurrent_jobs,
-    )
-
-    abscwd = os.path.abspath('.')
-    parameters = {'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'), 'config': config}
-    hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention
-    track_reads_h_done_plf = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_h_done'))
-    make_track_reads_task = PypeTask(inputs = {'hasm_done': hasm_done_plf},
-                                     outputs = {'job_done': track_reads_h_done_plf},
-                                     parameters = parameters,
-    )
-    track_reads_task = make_track_reads_task(task_track_reads)
-    #sge_track_reads = config['sge_track_reads']
-
-    wf.addTask(track_reads_task)
-
-    scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json')
-    make_task = PypeTask(
-            inputs = {
-                'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
-                'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
-                'track_reads_h_done': track_reads_h_done_plf,
-            },
-            outputs = {
-                'scattered_quiver_json': scattered_quiver_plf,
-            },
-            parameters = {},
-    )
-    wf.addTask(make_task(task_scatter_quiver))
-    wf.refreshTargets()
-
-    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(wf, config, scattered_quiver_plf)
-
-    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
-    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
-    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
-    mkdir('4-quiver/cns_gather')
-    with open(fn(gathered_p_ctg_plf), 'w') as ofs:
-        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
-            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
-    with open(fn(gathered_h_ctg_plf), 'w') as ofs:
-        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
-            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
-
-    make_task = PypeTask(
-            inputs = job_done_plfs,
-            outputs = {
-                'job_done': gather_done_plf,
-            },
-            parameters = {},
-    )
-    wf.addTask(make_task(task_gather_quiver))
-    wf.refreshTargets()
-
-    cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta')
-    cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq')
-    cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta')
-    cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq')
-    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
-    make_task = PypeTask(
-            inputs = {
-                'gathered_p_ctg': gathered_p_ctg_plf,
-                'gathered_h_ctg': gathered_h_ctg_plf,
-                'gather_done': gather_done_plf,
-            },
-            outputs = {
-                'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
-                'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
-                'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
-                'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
-                'job_done': zcat_done_plf,
-            },
-    )
-    wf.addTask(make_task(task_cns_zcat))
-
-    wf.refreshTargets()
diff --git a/FALCON_unzip/falcon_unzip/select_reads_from_bam.py b/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
deleted file mode 100644
index 03cbde1..0000000
--- a/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import pysam
-
-import argparse
-import glob
-import os
-import sys
-
-def select_reads_from_bam(input_bam_fofn_fn, asm_dir, hasm_dir, quiver_dir):
-    read_partition = {}
-    read_to_ctgs = {}
-
-    rawread_to_contigs_fn = os.path.join(quiver_dir, 'read_maps', 'rawread_to_contigs')
-    rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
-    rid_to_oid = open(rawread_ids_fn).read().split('\n')
-    with open(rawread_to_contigs_fn) as f:
-        for row in f:
-            row = row.strip().split()
-            if int(row[3]) >= 1: # keep only the top hit
-                continue
-            ctg_id = row[1]
-            if ctg_id == 'NA':
-                continue
-            read_partition.setdefault( ctg_id, set())
-            r_id = row[0]
-            o_id = rid_to_oid[ int(r_id) ]
-            read_partition[ ctg_id ].add(o_id)
-            read_to_ctgs.setdefault(o_id, [])
-            read_to_ctgs[ o_id ].append( (int(row[4]) ,ctg_id) )
-    print "num read_partitions:", len(read_partition)
-    print "num read_to_ctgs:", len(read_to_ctgs)
-
-    header = None
-    fofn_basedir = os.path.normpath(os.path.dirname(input_bam_fofn_fn))
-    def abs_fn(maybe_rel_fn):
-        if os.path.isabs(maybe_rel_fn):
-            return maybe_rel_fn
-        else:
-            return os.path.join(fofn_basedir, maybe_rel_fn)
-    for row in open(input_bam_fofn_fn):
-        fn = abs_fn(row.strip())
-        samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
-        if header is None:
-            header = samfile.header
-        else:
-            header['RG'].extend(samfile.header['RG'])
-        samfile.close()
-
-    PG = header.pop('PG') # remove PG lines, as there may be a bug that generates unreadable characters
-    #print PG
-
-    #base_dir = os.getcwd()
-    #outfile = pysam.AlignmentFile( os.path.join(base_dir, 'header.sam' ), 'wh', header=header )
-    #outfile.close()
-
-    ctgs = read_partition.keys()
-    ctgs.sort()
-    selected_ctgs = set()
-    for ctg in ctgs:
-        picked_reads = read_partition[ ctg ]
-        print "ctg, len:", ctg, len(picked_reads) # TODO: Is this for debugging?
-        if len(picked_reads) > 20:
-            selected_ctgs.add(ctg)
-
-    outfile = {}
-
-    sam_dir = os.path.join(quiver_dir, 'reads')
-    mkdir(sam_dir)
-    for row in open(input_bam_fofn_fn):
-        fn = abs_fn(row.strip())
-        samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
-        for r in samfile.fetch( until_eof = True ):
-            if r.query_name not in read_to_ctgs:
-                #print "Missing:", r.query_name
-                continue
-            ctg_list = read_to_ctgs[ r.query_name ]
-            ctg_list.sort()
-            score, ctg = ctg_list[0]
-            if ctg not in selected_ctgs:
-                #print "Not selected:", ctg
-                continue
-            if ctg not in outfile:
-                samfile_fn = os.path.join(quiver_dir, 'reads', '%s.sam' % ctg)
-                print >>sys.stderr, 'samfile_fn:{!r}'.format(samfile_fn)
-                outfile[ctg] = pysam.AlignmentFile(samfile_fn, 'wh', header=header)
-            outfile[ctg].write(r)
-        samfile.close()
-
-    for ctg in outfile:
-        outfile[ctg].close()
-
-def mkdir(d):
-    if not os.path.isdir(d):
-        os.makedirs(d)
-
-def parse_args(argv):
-    parser = argparse.ArgumentParser(description='Assume 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
-              formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--basedir', type=str, default='./', help='the base working dir of a FALCON assembly')
-    parser.add_argument('input_bam_fofn', type=str, help='File of BAM filenames. Paths are relative to dir of FOFN, not CWD.')
-    args = parser.parse_args(argv[1:])
-    return args
-
-def main(argv=sys.argv):
-    args = parse_args(argv)
-    basedir = args.basedir
-    input_bam_fofn = args.input_bam_fofn
-    #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
-    #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
-    asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
-    hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
-    quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
-
-    select_reads_from_bam(input_bam_fofn, asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
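
Two details in select_reads_from_bam above are easy to miss: entries in the BAM FOFN are resolved against the FOFN's own directory rather than the CWD, and each read is assigned to the contig whose (score, ctg) tuple sorts first. A self-contained sketch of both on made-up values (this abs_fn takes the base directory as a parameter instead of closing over it):

    import os

    def abs_fn(maybe_rel_fn, fofn_basedir):
        # Resolve a FOFN entry against the FOFN's directory, as above.
        if os.path.isabs(maybe_rel_fn):
            return maybe_rel_fn
        return os.path.join(fofn_basedir, maybe_rel_fn)

    def best_ctg(ctg_list):
        # Tuples sort on the score element first, so element 0 is the
        # best hit under whatever score convention was written upstream.
        return sorted(ctg_list)[0][1]

    print(abs_fn('movie1.subreads.bam', '/data/fofn_dir'))
    # /data/fofn_dir/movie1.subreads.bam
    print(best_ctg([(-500, '000001F'), (-900, '000000F')]))
    # 000000F
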
diff --git a/FALCON_unzip/falcon_unzip/unzip.py b/FALCON_unzip/falcon_unzip/unzip.py
deleted file mode 100644
index e585fbc..0000000
--- a/FALCON_unzip/falcon_unzip/unzip.py
+++ /dev/null
@@ -1,357 +0,0 @@
-from falcon_kit import run_support as support
-from pypeflow.simple_pwatcher_bridge import (
-        PypeLocalFile, makePypeLocalFile, fn,
-        PypeTask,
-        PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
-from falcon_kit.FastaReader import FastaReader
-import glob
-import logging
-import os
-import re
-import sys
-import time
-import ConfigParser
-
-LOG = logging.getLogger(__name__)
-
-def system(call, check=False):
-    LOG.debug('$(%s)' %repr(call))
-    rc = os.system(call)
-    msg = 'Call %r returned %d.' % (call, rc)
-    if rc:
-        LOG.warning(msg)
-        if check:
-            raise Exception(msg)
-    else:
-        LOG.debug(msg)
-    return rc
-
-def mkdir(d):
-    if not os.path.isdir(d):
-        os.makedirs(d)
-
-def task_track_reads(self):
-    job_done = fn(self.job_done)
-    wd = self.parameters['wd']
-    config = self.parameters['config']
-    sge_track_reads = config['sge_track_reads']
-    script_dir = os.path.join(wd)
-    script_fn = os.path.join(script_dir , 'track_reads.sh')
-    topdir = '../..'
-
-    script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-hostname
-date
-cd {topdir}
-python -m falcon_kit.mains.get_read_ctg_map
-python -m falcon_kit.mains.rr_ctg_track
-python -m falcon_kit.mains.pr_ctg_track
-#mkdir -p 3-unzip/reads/
-python -m falcon_kit.mains.fetch_reads
-cd {wd}
-date
-touch {job_done}
-""".format(**locals())
-
-    with open(script_fn,'w') as script_file:
-        script_file.write(script)
-    self.generated_script_fn = script_fn
-    #job_data['sge_option'] = sge_track_reads
-
-
-def task_run_blasr(self):
-    job_done = fn(self.job_done)
-    ref_fasta = fn(self.ref_fasta)
-    read_fasta = fn(self.read_fasta)
-
-    job_uid = self.parameters['job_uid']
-    wd = self.parameters['wd']
-    ctg_id = self.parameters['ctg_id']
-
-    config = self.parameters['config']
-    smrt_bin = config['smrt_bin']
-    sge_blasr_aln = config['sge_blasr_aln']
-    job_type = config['job_type']
-    blasr = os.path.join(smrt_bin, 'blasr')
-    samtools = os.path.join(smrt_bin, 'samtools')
-
-
-    script_dir = os.path.join(wd)
-    script_fn =  os.path.join(script_dir , 'aln_{ctg_id}.sh'.format(ctg_id = ctg_id))
-
-    script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-cd {wd}
-hostname
-date
-cd {wd}
-time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
- -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
- -minMatch 12  -nproc 24 -sam -out tmp_aln.sam
-{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
-{samtools} index {ctg_id}_sorted.bam
-rm tmp_aln.sam
-date
-touch {job_done}
-""".format(**locals())
-
-    with open(script_fn,'w') as script_file:
-        script_file.write(script)
-    self.generated_script_fn = script_fn
-    #job_data['sge_option'] = sge_blasr_aln
-
-
-def task_phasing(self):
-    ref_fasta = fn(self.ref_fasta)
-    aln_bam = fn(self.aln_bam)
-
-    job_done = fn(self.job_done)
-
-    job_uid = self.parameters['job_uid']
-    wd = self.parameters['wd']
-    ctg_id = self.parameters['ctg_id']
-
-    config = self.parameters['config']
-    sge_phasing = config['sge_phasing']
-    job_type = config['job_type']
-
-    script_dir = os.path.join(wd)
-    script_fn =  os.path.join(script_dir , 'p_%s.sh' % (ctg_id))
-
-    script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-cd {wd}
-hostname
-date
-cd {wd}
-fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir ..
-fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads
-date
-touch {job_done}
-""".format(**locals())
-
-    with open(script_fn,'w') as script_file:
-        script_file.write(script)
-    self.generated_script_fn = script_fn
-    #job_data['sge_option'] = sge_phasing
-
-
-def task_hasm(self):
-    rid_to_phase_all = fn(self.rid_to_phase_all)
-    job_done = fn(self.job_done)
-    config = self.parameters['config']
-    sge_hasm = config['sge_hasm']
-
-    wd = self.parameters['wd']
-
-    job_type = config['job_type']
-
-    script_dir = os.path.join(wd)
-    script_fn =  os.path.join(script_dir , 'hasm.sh')
-
-    las_fofn = '../../1-preads_ovl/merge-gather/las.fofn' # (formerly '../../2-asm-falcon/las.fofn')
-    script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-hostname
-date
-cd {wd}
-
-fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
-fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
-if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
-then
-  ln -sf ../../1-preads_ovl/preads4falcon.fasta .
-else
-  ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
-fi
-fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta
-
-# The rest of this script is a bit hacky; it should be improved.
-
-WD=$PWD
-for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done
-
-## prepare for quivering the haplotigs
-cd $WD/..
-if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
-if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
-if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
-if [ -e "all_h_ctg_edges" ]; then rm all_h_ctg_edges; fi
-if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
-if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi
-
-find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
-find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
-find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
-find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
-find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
-find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
-cd ../
-date
-touch {job_done}
-""".format(**locals())
-
-    with open(script_fn,'w') as script_file:
-        script_file.write(script)
-    self.generated_script_fn = script_fn
-    #job_data['sge_option'] = sge_hasm
-
-def unzip_all(config):
-    unzip_concurrent_jobs = config['unzip_concurrent_jobs']
-    wf = PypeProcWatcherWorkflow(
-            max_jobs=unzip_concurrent_jobs,
-    )
-    wf.max_jobs = unzip_concurrent_jobs
-
-    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
-    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
-    wdir = os.path.abspath('./3-unzip/reads')
-    parameters = {'wd': wdir, 'config': config}
-
-    job_done = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_done'))
-    make_track_reads_task = PypeTask(inputs = {'falcon_asm_done': falcon_asm_done},
-                                     outputs = {'job_done': job_done, 'ctg_list_file': ctg_list_file},
-                                     parameters = parameters,
-                                     wdir = wdir,
-    )
-    track_reads_task = make_track_reads_task(task_track_reads)
-
-    wf.addTask(track_reads_task)
-    wf.refreshTargets() # force a refresh now; proper dependencies will be wired in later
-
-    ctg_ids = []
-    with open('./3-unzip/reads/ctg_list') as f:
-        for row in f:
-            row = row.strip()
-            ctg_ids.append(row)
-
-    aln1_outs = {}
-
-    all_ctg_out = {}
-
-    for ctg_id in ctg_ids:
-        # inputs
-        ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
-        read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))
-
-        # outputs
-        wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))
-        #mkdir(wd)
-        blasr_dir = os.path.join(wd, 'blasr')
-        ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))
-        job_done = makePypeLocalFile(os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id = ctg_id)))
-
-        parameters = {'job_uid':'aln-'+ctg_id, 'wd': blasr_dir, 'config':config, 'ctg_id': ctg_id}
-        make_blasr_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_fasta': read_fasta},
-                                   outputs = {'ctg_aln_out': ctg_aln_out, 'job_done': job_done},
-                                   parameters = parameters,
-        )
-        blasr_task = make_blasr_task(task_run_blasr)
-        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
-        wf.addTask(blasr_task)
-
-        phasing_dir = os.path.join(wd, 'phasing')
-        job_done = makePypeLocalFile(os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id = ctg_id)))
-        rid_to_phase_out = makePypeLocalFile(os.path.join(wd, 'rid_to_phase.{ctg_id}'.format(ctg_id = ctg_id))) # TODO: ???
-        all_ctg_out[ 'r2p.{ctg_id}'.format(ctg_id = ctg_id) ] = rid_to_phase_out # implicit output?
-
-        parameters = {'job_uid':'ha-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id}
-        make_phasing_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'aln_bam':ctg_aln_out},
-                                   outputs = {'job_done': job_done},
-                                   parameters = parameters,
-        )
-        phasing_task = make_phasing_task(task_phasing)
-        wf.addTask(phasing_task)
-
-    wf.refreshTargets()
-
-    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
-    #mkdir(hasm_wd)
-    rid_to_phase_all = makePypeLocalFile(os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
-    task = PypeTask(inputs = all_ctg_out,
-                    outputs = {'rid_to_phase_all': rid_to_phase_all},
-    )(get_rid_to_phase_all)
-    wf.addTask(task)
-
-    parameters['wd'] = hasm_wd
-    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
-    make_hasm_task = PypeTask(inputs = {'rid_to_phase_all': rid_to_phase_all},
-                              outputs = {'job_done': job_done},
-                              parameters = parameters,
-    )
-    hasm_task = make_hasm_task(task_hasm)
-
-    wf.addTask(hasm_task)
-
-    wf.refreshTargets()
-
-def get_rid_to_phase_all(self):
-    # Tasks must be at module scope now.
-    rid_to_phase_all_fn = fn(self.rid_to_phase_all)
-    inputs_fn = [ fn(f) for f in self.inputs.values() ]
-    inputs_fn.sort()
-    output = []
-    for fname in inputs_fn:
-        output.append(open(fname).read()) # append whole-file text; extend() would add it char by char
-
-    out = open(rid_to_phase_all_fn, 'w')
-    out.write(''.join(output))
-    out.close()
-
-def main(argv=sys.argv):
-    global LOG
-    LOG = support.setup_logger(None)
-
-
-    if len(argv) < 2 or argv[1].startswith('-'):
-        print 'you need to provide a configuration file to specify the cluster runtime environment'
-        sys.exit(1)
-
-    config_fn = argv[1]
-
-    config = ConfigParser.ConfigParser()
-    config.read(config_fn)
-
-    job_type = 'SGE'
-    if config.has_option('General', 'job_type'):
-        job_type = config.get('General', 'job_type')
-
-    sge_blasr_aln = ' -pe smp 24 -q bigmem '
-    if config.has_option('Unzip', 'sge_blasr_aln'):
-        sge_blasr_aln = config.get('Unzip', 'sge_blasr_aln')
-
-    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
-    if config.has_option('Unzip', 'smrt_bin'):
-        smrt_bin = config.get('Unzip', 'smrt_bin')
-
-    sge_phasing = ' -pe smp 12 -q bigmem'
-    if config.has_option('Unzip', 'sge_phasing'):
-        sge_phasing = config.get('Unzip', 'sge_phasing')
-
-    sge_hasm = ' -pe smp 48 -q bigmem'
-    if config.has_option('Unzip', 'sge_hasm'):
-        sge_hasm = config.get('Unzip', 'sge_hasm')
-
-    sge_track_reads = ' -pe smp 12 -q bigmem'
-    if config.has_option('Unzip', 'sge_track_reads'):
-        sge_track_reads = config.get('Unzip', 'sge_track_reads')
-
-    unzip_concurrent_jobs = 8
-    if config.has_option('Unzip', 'unzip_concurrent_jobs'):
-        unzip_concurrent_jobs = config.getint('Unzip', 'unzip_concurrent_jobs')
-
-    config = {'job_type': job_type,
-              'sge_blasr_aln': sge_blasr_aln,
-              'smrt_bin': smrt_bin,
-              'sge_phasing': sge_phasing,
-              'sge_hasm': sge_hasm,
-              'sge_track_reads': sge_track_reads,
-              'unzip_concurrent_jobs':unzip_concurrent_jobs}
-
-    #support.job_type = 'SGE' #tmp hack until we have a configuration parser
-
-    unzip_all(config)
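
main() above repeats the same probe-then-default ConfigParser pattern for six options. A small helper collapses it; get_opt is a hypothetical name, not part of falcon_unzip, and the has_section guard avoids the NoSectionError that has_option raises when the section itself is missing:

    try:
        import ConfigParser as configparser  # Python 2, as in unzip.py
    except ImportError:
        import configparser                  # Python 3

    def get_opt(config, section, option, default, getter='get'):
        # Return the parsed option if present, else the default.
        if config.has_section(section) and config.has_option(section, option):
            return getattr(config, getter)(section, option)
        return default

    config = configparser.ConfigParser()
    config.read('fc_unzip.cfg')  # a missing file just leaves the parser empty
    job_type = get_opt(config, 'General', 'job_type', 'SGE')
    n_jobs = get_opt(config, 'Unzip', 'unzip_concurrent_jobs', 8, getter='getint')
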
diff --git a/FALCON_unzip/setup.py b/FALCON_unzip/setup.py
deleted file mode 100755
index b54466c..0000000
--- a/FALCON_unzip/setup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-from setuptools import setup
-
-import glob
-
-install_requires=[ "networkx >= 1.7", "pysam >= 0.8.4" ]
-
-scripts = glob.glob("src/py_scripts/*.py")
-
-setup(name='falcon_unzip',
-      version='0.1.0',
-      description='Falcon unzip',
-      author='Jason Chin',
-      author_email='jchin@pacificbiosciences.com',
-      packages=['falcon_unzip'],
-      package_dir={'falcon_unzip':'falcon_unzip/'},
-      scripts = scripts,
-      zip_safe = False,
-      install_requires=install_requires
-     )
-
diff --git a/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
deleted file mode 100644
index 71b60f5..0000000
--- a/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.dedup_h_tigs import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py b/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
deleted file mode 100644
index 09af64b..0000000
--- a/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.get_read_hctg_map import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
deleted file mode 100644
index e9fcb1f..0000000
--- a/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.graphs_to_h_tigs import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py b/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
deleted file mode 100644
index c3a38d2..0000000
--- a/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from falcon_unzip.ovlp_filter_with_phase import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
-
-
diff --git a/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py b/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
deleted file mode 100644
index 2c58c5a..0000000
--- a/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from falcon_unzip.phased_ovlp_to_graph import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
-
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing.py b/FALCON_unzip/src/py_scripts/fc_phasing.py
deleted file mode 100644
index 70c4711..0000000
--- a/FALCON_unzip/src/py_scripts/fc_phasing.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from falcon_unzip.phasing import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
-
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py b/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
deleted file mode 100644
index 3fa3592..0000000
--- a/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.phasing_readmap import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_quiver.py b/FALCON_unzip/src/py_scripts/fc_quiver.py
deleted file mode 100644
index 21a5af5..0000000
--- a/FALCON_unzip/src/py_scripts/fc_quiver.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.run_quiver import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py b/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
deleted file mode 100644
index b68b43a..0000000
--- a/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.rr_hctg_track import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py b/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
deleted file mode 100644
index 0e85055..0000000
--- a/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.select_reads_from_bam import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py b/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
deleted file mode 100644
index f84dc30..0000000
--- a/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
+++ /dev/null
@@ -1,398 +0,0 @@
-#from pypeflow.common import *
-#from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
-#from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
-#from pypeflow.controller import PypeWorkflow, PypeMPWorkflow, PypeThreadWorkflow
-from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
-        makePypeLocalFile, fn, PypeTask)
-PypeThreadTaskBase = MyFakePypeThreadTaskBase
-from falcon_kit.FastaReader import FastaReader
-from falcon_kit.fc_asm_graph import AsmGraph
-import glob
-import sys
-import subprocess as sp
-import shlex
-import os
-
-def make_dirs(d):
-    if not os.path.isdir(d):
-        os.makedirs(d)
-
-rawread_dir = os.path.abspath( "./0-rawreads" )
-pread_dir = os.path.abspath( "./1-preads_ovl" )
-asm_dir = os.path.abspath( os.path.join("./3-unzip/") )
-
-read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
-make_dirs(read_map_dir)
-
-wf = PypeProcWatcherWorkflow(
-        max_jobs=12,
-)
-
-rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
-rawread_id_file = makePypeLocalFile( os.path.join( rawread_dir, "raw_read_ids" ) )
-
-@PypeTask( inputs = {"rawread_db": rawread_db},
-           outputs =  {"rawread_id_file": rawread_id_file},
-           TaskType = PypeThreadTaskBase,
-           URL = "task://localhost/dump_rawread_ids" )
-def dump_rawread_ids(self):
-    rawread_db = fn( self.rawread_db )
-    rawread_id_file = fn( self.rawread_id_file )
-    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )
-
-wf.addTask( dump_rawread_ids )
-
-pread_db = makePypeLocalFile( os.path.join( pread_dir, "preads.db" ) )
-pread_id_file = makePypeLocalFile( os.path.join( pread_dir, "pread_ids" ) )
-
-@PypeTask( inputs = {"pread_db": pread_db},
-           outputs =  {"pread_id_file": pread_id_file},
-           TaskType = PypeThreadTaskBase,
-           URL = "task://localhost/dump_pread_ids" )
-def dump_pread_ids(self):
-    pread_db = fn( self.pread_db )
-    pread_id_file = fn( self.pread_id_file )
-    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file) )
-
-wf.addTask( dump_pread_ids )
-wf.refreshTargets() # block
-
-all_raw_las_files = {}
-for las_fn in glob.glob( os.path.join( rawread_dir, "m*/raw_reads.*.las") ):
-    idx = las_fn.split("/")[-1] # parse the block index from the filename (a regex would be more robust)
-    idx = int(idx.split(".")[1]) 
-    las_file = makePypeLocalFile( las_fn )
-    all_raw_las_files["r_las_%s" % idx] = las_file 
-
-all_pread_las_files = {}
-for las_fn in glob.glob( os.path.join( pread_dir, "m*/preads.*.las") ):
-    idx = las_fn.split("/")[-1] # parse the block index from the filename (a regex would be more robust)
-    idx = int(idx.split(".")[1]) 
-    las_file = makePypeLocalFile( las_fn )
-    all_pread_las_files["p_las_%s" % idx] = las_file 
-
-
-
-h_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_edges") )
-p_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_p_ctg_edges") )
-h_ctg_ids = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_ids") )
-
-inputs = { "rawread_id_file": rawread_id_file,
-           "pread_id_file": pread_id_file,
-           "h_ctg_edges": h_ctg_edges,
-           "p_ctg_edges": p_ctg_edges,
-           "h_ctg_ids": h_ctg_ids}
-
-read_to_contig_map = makePypeLocalFile( os.path.join(read_map_dir, "read_to_contig_map") )
-
-@PypeTask( inputs = inputs,
-           outputs = {"read_to_contig_map": read_to_contig_map}, 
-           TaskType = PypeThreadTaskBase, 
-           URL = "task://localhost/get_ctg_read_map" )
-def generate_read_to_ctg_map(self):
-    rawread_id_file = fn( self.rawread_id_file )
-    pread_id_file = fn( self.pread_id_file )
-    read_to_contig_map = fn( self.read_to_contig_map )
-    
-    pread_did_to_rid = open(pread_id_file).read().split("\n")
-    rid_to_oid = open(rawread_id_file).read().split("\n")
-
-    h_ctg_edges = fn( self.h_ctg_edges )
-    p_ctg_edges = fn( self.p_ctg_edges )
-
-    h_ctg_ids = set()
-    with open(fn(self.h_ctg_ids)) as f:
-        for row in f:
-            row = row.strip()
-            h_ctg_ids.add( row )
-
-    pread_to_contigs = {}
-
-    for fname in ( p_ctg_edges, h_ctg_edges):
-        with open(fname) as f:
-            for row in f:
-                row = row.strip().split()
-                ctg = row[0]
-                if len(ctg.split("_")) > 1 and ctg not in h_ctg_ids:
-                    continue
-                n1 = row[1]
-                n2 = row[2]
-                pid1 = int(n1.split(":")[0])
-                pid2 = int(n2.split(":")[0])
-                rid1 = pread_did_to_rid[pid1].split("/")[1]
-                rid2 = pread_did_to_rid[pid2].split("/")[1]
-                rid1 = int(int(rid1)/10)
-                rid2 = int(int(rid2)/10)
-                oid1 = rid_to_oid[rid1]
-                oid2 = rid_to_oid[rid2]
-                k1 = (pid1, rid1, oid1)
-                pread_to_contigs.setdefault( k1, set() )
-                pread_to_contigs[ k1 ].add( ctg )
-                k2 = (pid2, rid2, oid2)
-                pread_to_contigs.setdefault( k2, set() )
-                pread_to_contigs[ k2 ].add( ctg )
-
-    with open(read_to_contig_map, "w") as f:
-        for k in pread_to_contigs:
-            pid, rid, oid = k
-            for ctg in list(pread_to_contigs[ k ]):
-                print >>f, "%09d %09d %s %s" % (pid, rid, oid, ctg)
-
-wf.addTask( generate_read_to_ctg_map )
-
-def dump_rawread_to_ctg(self):
-    rawread_db = fn(self.rawread_db)
-    rawread_id_file = fn(self.rawread_id_file)
-    phased_read_file = fn(self.phased_reads)
-    las_file = fn(self.las_file)
-    rawread_to_contig_file = fn(self.rawread_to_contig_file)
-    read_to_contig_map = fn(self.read_to_contig_map)
-    rid_to_oid = open(rawread_id_file).read().split('\n')
-
-
-    ovlp_data = []
-    ovlp_count = 0
-    longest_ovlp = 0
-    a_id = None
-    rid_to_contigs = {}
-    
-    with open(read_to_contig_map) as f:
-        for row in f:
-            row = row.strip().split()
-            pid, rid, oid, ctg = row
-            rid = int(rid)
-            rid_to_contigs.setdefault( rid, (oid, set() ) )
-            rid_to_contigs[ rid ][1].add( ctg )
-
-    oid_to_phase = {}
-    with open(phased_read_file) as f:
-        for row in f:
-            row = row.strip().split()
-            ctg_id, block, phase = row[1:4]
-            oid = row[6]
-            block = int(block)
-            phase = int(phase)
-            oid_to_phase[ oid ] = (ctg_id, block, phase)
-
-
-    with open(rawread_to_contig_file, "w") as f:
-        ovlp_data = {}
-        cur_read_id = None
-        skip_rest = 0
-        for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file)) ).splitlines():
-
-            row = row.strip().split()
-            t_id = int(row[1])
-            q_id = int(row[0])
-            if q_id != cur_read_id:
-                if cur_read_id == None:
-                    cur_read_id = q_id
-                else:
-                    if len(ovlp_data) == 0:
-                        o_id = rid_to_oid[ cur_read_id ]
-                        print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
-                    if len(ovlp_data) != 0:
-                        ovlp_v = ovlp_data.values()
-                        ovlp_v.sort()
-                        rank = 0
-                        for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
-                            print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
-                            rank += 1
-                    ovlp_data = {}
-                    cur_read_id = q_id
-                    skip_rest = 0
-
-            if q_id in rid_to_contigs and len(ovlp_data) == 0: # if the query read already belongs to some contig
-                t_o_id, ctgs = rid_to_contigs[ q_id ]
-                o_id = rid_to_oid[ q_id ]
-                for ctg in list(ctgs):
-                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
-                    ovlp_data[ctg][0] = -int(row[7]) 
-                    ovlp_data[ctg][1] += 1
-                    skip_rest = 1
-
-            if skip_rest == 1:
-                continue
-
-            if t_id not in rid_to_contigs:
-                continue
-
-            q_phase = oid_to_phase.get( rid_to_oid[q_id], None )
-            if q_phase != None:
-                ctg_id, block, phase = q_phase
-                if block != -1:
-                    t_phase = oid_to_phase.get( rid_to_oid[t_id], None )
-                    if t_phase != None:
-                        if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
-                            continue
-
-            t_o_id, ctgs = rid_to_contigs[ t_id ]
-            o_id = rid_to_oid[ q_id ]
-            
-            for ctg in list(ctgs):
-                ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
-                ovlp_data[ctg][0] += int(row[2])
-                ovlp_data[ctg][1] += 1
-
-        if len(ovlp_data) != 0:
-            ovlp_v = ovlp_data.values()
-            ovlp_v.sort()
-            rank = 0
-            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
-                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
-                rank += 1
-
-def dump_pread_to_ctg(self):
-    pread_db = fn( self.pread_db )
-    rawread_id_file = fn( self.rawread_id_file )
-    pread_id_file = fn( self.pread_id_file )
-    phased_read_file = fn( self.phased_reads)
-    read_to_contig_map = fn( self.read_to_contig_map )
-    las_file = fn( self.las_file )
-    pread_to_contig_file = fn( self.pread_to_contig_file )
-    read_to_contig_map = fn( self.read_to_contig_map )
-    
-    pid_to_rid = open(pread_id_file).read().split("\n")
-    rid_to_oid = open(rawread_id_file).read().split("\n")
-
-
-    ovlp_data = []
-    ovlp_count = 0
-    longest_ovlp = 0
-    a_id = None
-    pid_to_contigs = {}
-    
-    with open(read_to_contig_map) as f:
-        for row in f:
-            row = row.strip().split()
-            pid, rid, oid, ctg = row
-            pid = int(pid)
-            pid_to_contigs.setdefault( pid, (oid, set() ) )
-            pid_to_contigs[ pid ][1].add( ctg )
-            
-    oid_to_phase = {}
-    with open(phased_read_file) as f:
-        for row in f:
-            row = row.strip().split()
-            ctg_id, block, phase = row[1:4]
-            oid = row[6]
-            block = int(block)
-            phase = int(phase)
-            oid_to_phase[ oid ] = (ctg_id, block, phase)
-
-    with open(pread_to_contig_file, "w") as f:
-        ovlp_data = {}
-        cur_read_id = None
-        skip_rest = 0
-        for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file)) ).splitlines():
-
-            row = row.strip().split()
-            t_id = int(row[1])
-            q_id = int(row[0])
-            if q_id != cur_read_id:
-                if cur_read_id == None:
-                    cur_read_id = q_id
-                else:
-                    if len(ovlp_data) == 0:
-                        rid = pid_to_rid[cur_read_id].split("/")[1]
-                        rid = int(int(rid)/10)
-                        o_id = rid_to_oid[ rid ]
-                        print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
-                    else:
-                        ovlp_v = ovlp_data.values()
-                        ovlp_v.sort()
-                        rank = 0
-                        for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
-                            print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
-                            rank += 1
-                    ovlp_data = {}
-                    cur_read_id = q_id
-                    skip_rest = 0
-
-            if q_id in pid_to_contigs and len(ovlp_data) == 0: # if the query pread already belongs to some contig
-                t_o_id, ctgs = pid_to_contigs[ q_id ]
-                rid = pid_to_rid[q_id].split("/")[1]
-                rid = int(int(rid)/10)
-                o_id = rid_to_oid[ rid ]
-                for ctg in list(ctgs):
-                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
-                    ovlp_data[ctg][0] = -int(row[7]) 
-                    ovlp_data[ctg][1] += 1
-                skip_rest = 1
-
-            if skip_rest == 1:
-                continue
-
-            if t_id not in pid_to_contigs:
-                continue
-        
-            q_rid = int( int(pid_to_rid[q_id].split("/")[1])/10 )
-            q_phase = oid_to_phase.get( rid_to_oid[ q_rid ], None )
-            
-            if q_phase != None:
-                ctg_id, block, phase = q_phase
-                if block != -1:
-                    t_rid = int( int(pid_to_rid[t_id].split("/")[1])/10 )
-                    t_phase = oid_to_phase.get( rid_to_oid[ t_rid ], None )
-                    if t_phase != None:
-                        if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
-                            continue
-
-            t_o_id, ctgs = pid_to_contigs[ t_id ]
-            rid = pid_to_rid[q_id].split("/")[1]
-            rid = int(int(rid)/10)
-            o_id = rid_to_oid[ rid ]
-            
-            for ctg in list(ctgs):
-                ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
-                ovlp_data[ctg][0] += int(row[2])
-                ovlp_data[ctg][1] += 1
-
-        if len(ovlp_data) != 0:
-            ovlp_v = ovlp_data.values()
-            ovlp_v.sort()
-            rank = 0
-            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
-                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
-                rank += 1
-
-
-phased_reads =  makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads"))
-
-
-for las_key, las_file in all_raw_las_files.items():
-    las_fn = fn(las_file)
-    idx = las_fn.split("/")[-1] # parse the block index from the filename (a regex would be more robust)
-    idx = int(idx.split(".")[1]) 
-    rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
-    make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file, 
-                                                    "rawread_db": rawread_db, 
-                                                    "read_to_contig_map": read_to_contig_map, 
-                                                    "rawread_id_file": rawread_id_file,
-                                                    "pread_id_file": pread_id_file,
-                                                    "phased_reads" : phased_reads},
-                                      outputs = { "rawread_to_contig_file": rawread_to_contig_file },
-                                      TaskType = PypeThreadTaskBase,
-                                      URL = "task://localhost/r_read_to_contigs.%s" % idx )
-    dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)                           
-    wf.addTask( dump_rawread_to_ctg_task )
-
-for las_key, las_file in all_pread_las_files.items():
-    las_fn = fn(las_file)
-    idx = las_fn.split("/")[-1] # parse the block index from the filename (a regex would be more robust)
-    idx = int(idx.split(".")[1]) 
-    pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
-    make_dump_pread_to_ctg = PypeTask( inputs = { "las_file": las_file, 
-                                                  "pread_db": pread_db, 
-                                                  "read_to_contig_map": read_to_contig_map, 
-                                                  "rawread_id_file": rawread_id_file,
-                                                  "pread_id_file": pread_id_file,
-                                                  "phased_reads": phased_reads},
-                                      outputs = { "pread_to_contig_file": pread_to_contig_file },
-                                      TaskType = PypeThreadTaskBase,
-                                      URL = "task://localhost/pread_to_contigs.%s" % idx )
-    dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)                           
-    wf.addTask( dump_pread_to_ctg_task )
-
-wf.refreshTargets() # block
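
Both dump functions above finish each read with the same ranking step: collect one [score, count, q_id, o_id, ctg, in_ctg] record per candidate contig, sort the records, and print them with an increasing rank. That step in isolation, on made-up records:

    def rank_ovlps(ovlp_data):
        # ovlp_data: ctg -> [score, count, q_id, o_id, ctg, in_ctg];
        # the lowest-sorting record (best score) gets rank 0, as above.
        for rank, rec in enumerate(sorted(ovlp_data.values())):
            yield rank, rec

    ovlp_data = {
        '000000F': [-1200, 3, 7, 'read/7', '000000F', 1],
        '000001F': [-300, 1, 7, 'read/7', '000001F', 0],
    }
    for rank, (score, count, q_id, o_id, ctg, in_ctg) in rank_ovlps(ovlp_data):
        print('%09d %s %s %d %d %d %d' % (q_id, o_id, ctg, count, rank, score, in_ctg))
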
diff --git a/FALCON_unzip/src/py_scripts/fc_unzip.py b/FALCON_unzip/src/py_scripts/fc_unzip.py
deleted file mode 100644
index 0e6bb71..0000000
--- a/FALCON_unzip/src/py_scripts/fc_unzip.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.unzip import main
-import sys
-if __name__ == "__main__":
-    main(sys.argv)
diff --git a/FALCON_unzip/src/py_utils/align.sh b/FALCON_unzip/src/py_utils/align.sh
deleted file mode 100644
index 2bcb508..0000000
--- a/FALCON_unzip/src/py_utils/align.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-~/bin/blasr_yli_0317 000000F_reads.fa 000000F_ref.fa  -noSplitSubreads -clipping subread  -placeRepeatsRandomly -minPctIdentity 70.0  -minMatch 12  -nproc 48 -bam -out aln.bam
-#~/bin/blasr_yli_0317 ../data/chm1_bam/all_merged.bam pa_ctg.fa  -noSplitSubreads -clipping subread  -placeRepeatsRandomly -minPctIdentity 70.0 -minMatch 12 -nproc 48 -bam -out test_chm1.bam 
-#samtools sort -m 120G test_chm13.bam chm13_aln 
-#samtools sort -m 120G test_chm1.bam chm1_aln 
-#python screen_aln_chm1.py
-#python screen_aln_chm13.py
-#bamtools merge -in chm1_aln_sampled.bam -in chm13_aln_sampled.bam -out chm1_chm13_sampled_merged.bam
-#samtools sort -m 120G chm1_chm13_sampled_merged.bam chm1_chm13_sampled_merged_sorted
-#samtools index chm1_chm13_sampled_merged_sorted.bam
-#variantCaller.py --skipUnrecognizedContigs -v -j40 -r pa_ctg.fa  --algorithm=quiver chm1_chm13_sampled_merged_sorted.bam -o cns.fastq.gz -o cns.fasta.gz
diff --git a/FALCON_unzip/src/py_utils/call_SV_from_mum.py b/FALCON_unzip/src/py_utils/call_SV_from_mum.py
deleted file mode 100644
index ac8e441..0000000
--- a/FALCON_unzip/src/py_utils/call_SV_from_mum.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import sys
-
-mum_fn = sys.argv[1]
-
-def process_mum_data( pread_id, mum_data ):
-    delta = 0
-    pend = 0
-    qend = 0
-    p_delta = 0
-
-
-    rtn = []
-    for rpos, qpos, len_ in mum_data:
-        rpos = int(rpos)
-        qpos = int(qpos)
-        len_ = int(len_)
-        offset = rpos - qpos
-        
-        if pend == 0:
-            rtn.append( ( rpos, qpos, len_, offset, rpos+len_, 0, 0, 0, pread_id) )
-        else:
-            rtn.append( ( rpos, qpos, len_, offset, rpos+len_, offset-p_offset, rpos-pend, qpos-qend, pread_id) )
-        pend = rpos+len_
-        qend = qpos+len_
-        p_offset = offset
-
-    # break the matches into clusters at large offset jumps
-    cluster = []
-    p_delta = 0 
-    for d in rtn:
-        rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
-        if q_delta < -10:
-            continue
-        if r_delta < -10:
-            continue
-        if len(cluster) == 0: #start a new cluster
-            cluster.append([])
-            cluster[-1].append( d )
-            #p_delta = delta
-            continue
-        if delta  > 30000:
-            #print delta, p_delta
-            #p_delta = 0
-            cluster.append([])
-            continue
-        cluster[-1].append( d )
-        #p_delta = delta
-    rtn = []
-    for cls_id in xrange(len(cluster)):
-        aln_size = 0
-        qpos_ary = []
-        if len(cluster[cls_id]) == 0:
-            continue
-        for d in cluster[cls_id]:
-            rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
-            aln_size += len_
-            qpos_ary.append(qpos)
-        if 1.0*(max(qpos_ary) - min(qpos_ary))/aln_size < 0.5 or aln_size < 2000:
-            continue
-
-        for d in cluster[cls_id]:
-            rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
-            rtn.append( (rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id) )
-
-
-    return rtn
-
-
-
-mum_data = []
-with open(mum_fn) as mum_file:
-    for row in mum_file:
-        row = row.strip().split()
-        if row[0] == ">":
-            if len(mum_data) > 0:
-                rtn = process_mum_data( pread_id, mum_data )
-                if len(rtn) > 0:
-                    print ">",pread_id
-                    for d in rtn:
-                        print " ".join([str(c) for c in d])
-            pread_id = row[1]
-            mum_data = []
-            #print 
-        else:
-            mum_data.append( row )
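
Condensed, the cluster filter in process_mum_data keeps a cluster only when its matches total at least 2 kb and the query positions spread over at least half that total. A runnable restatement of just that test:

    def keep_cluster(matches):
        # matches: list of (rpos, qpos, len_) MUMs already grouped by offset.
        aln_size = sum(len_ for _, _, len_ in matches)
        if aln_size < 2000:
            return False
        qpos = [q for _, q, _ in matches]
        return (max(qpos) - min(qpos)) / float(aln_size) >= 0.5

    print(keep_cluster([(100, 100, 1500), (5000, 5000, 1500)]))  # True
    print(keep_cluster([(100, 100, 1500), (1700, 1700, 400)]))   # False: < 2 kb
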
diff --git a/FALCON_unzip/src/py_utils/fetch_read.py b/FALCON_unzip/src/py_utils/fetch_read.py
deleted file mode 100644
index 74def7e..0000000
--- a/FALCON_unzip/src/py_utils/fetch_read.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from falcon_kit.FastaReader import FastaReader
-
-import sys
-
-f = FastaReader(sys.argv[1])
-rl = set(open(sys.argv[2]).read().split())
-for r in f:
-    rid = r.name.split()[0]
-    if rid not in rl:
-        continue
-    print ">"+rid
-    print r.sequence
diff --git a/FALCON_unzip/src/py_utils/split_bam.py b/FALCON_unzip/src/py_utils/split_bam.py
deleted file mode 100644
index f5562f0..0000000
--- a/FALCON_unzip/src/py_utils/split_bam.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import subprocess, shlex
-
-
-read_to_phase = {}
-with open("phased_reads") as f:
-    for l in f:
-        l = l.strip().split()
-        rid = l[-1]
-        phase = l[1] + "_" +l[2]
-        read_to_phase[rid] = phase
-
-p = subprocess.Popen(shlex.split("samtools view -h 000207F.000207F000207F.bam" ), stdout=subprocess.PIPE)
-
-phase_to_file = {}
-header = []
-for l in p.stdout:
-    l2 = l.strip().split()
-    if l2[0][0] == "@":
-        header.append(l.strip())
-        continue # header lines are not alignment records
-    QNAME = l2[0]
-    phase = read_to_phase.get(QNAME, None)
-    if phase != None:
-        pfn = "./phased_sam/%s.sam" % phase
-        if pfn not in phase_to_file:
-            phase_to_file[pfn] = open(pfn,"w")
-            print >> phase_to_file[pfn], "\n".join(header)
-        print >>  phase_to_file[pfn], l.strip()
-
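
The core of split_bam.py is a header-replaying demultiplex: header lines are buffered, and each alignment record is routed to its phase, with the header written once per newly seen phase. The routing on plain text lines, as a sketch (demux_sam is an invented name):

    def demux_sam(lines, read_to_phase):
        # Split SAM text lines into one bucket per phase; every bucket
        # starts with a copy of the shared header ('@' lines).
        header, buckets = [], {}
        for line in lines:
            if line.startswith('@'):
                header.append(line)
                continue
            phase = read_to_phase.get(line.split('\t', 1)[0])
            if phase is None:
                continue  # unphased reads are dropped, as above
            buckets.setdefault(phase, list(header)).append(line)
        return buckets

    sam = ['@HD\tVN:1.5', 'r1\t0\tctg\t1\t60\t*', 'r2\t0\tctg\t5\t60\t*']
    print(demux_sam(sam, {'r1': '000207F_0_0'}))
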
diff --git a/pypeFLOW/pwatcher/fs_based.py b/pypeFLOW/pwatcher/fs_based.py
index e22639f..1a203a8 100755
--- a/pypeFLOW/pwatcher/fs_based.py
+++ b/pypeFLOW/pwatcher/fs_based.py
@@ -286,12 +286,14 @@ class MetaJobSge(object):
         #cwd = os.getcwd()
         job_name = self.get_jobname()
         sge_option = qstripped(self.mjob.job.options['sge_option'])
-        job_queue = self.mjob.job.options['job_queue']
+        if '-q' not in sge_option:
+            job_queue = self.mjob.job.options['job_queue']
+            sge_option = '-q {} '.format(job_queue) + sge_option
         # Add shebang, in case shell_start_mode=unix_behavior.
         #   https://github.com/PacificBiosciences/FALCON/pull/348
         with open(script_fn, 'r') as original: data = original.read()
         with open(script_fn, 'w') as modified: modified.write("#!/bin/bash" + "\n" + data)
-        sge_cmd = 'qsub -N {job_name} -q {job_queue} {sge_option} {specific} -cwd -o stdout -e stderr -S {exe} {script_fn}'.format(
+        sge_cmd = 'qsub -N {job_name} {sge_option} {specific} -cwd -o stdout -e stderr -S {exe} {script_fn}'.format(
                 **locals())
         system(sge_cmd, checked=True) # TODO: Capture q-jobid
     def kill(self, state, heartbeat):
@@ -330,12 +332,14 @@ usage: qsub [-a date_time] [-A account_string] [-c interval]
         #cwd = os.getcwd()
         job_name = self.get_jobname()
         sge_option = qstripped(self.mjob.job.options['sge_option'])
-        job_queue = self.mjob.job.options['job_queue']
+        if '-q' not in sge_option:
+            job_queue = self.mjob.job.options['job_queue']
+            sge_option = '-q {} '.format(job_queue) + sge_option
         # Add shebang, in case shell_start_mode=unix_behavior.
         #   https://github.com/PacificBiosciences/FALCON/pull/348
         with open(script_fn, 'r') as original: data = original.read()
         with open(script_fn, 'w') as modified: modified.write("#!/bin/bash" + "\n" + data)
-        sge_cmd = 'qsub -N {job_name} -q {job_queue} {sge_option} {specific} -o stdout -e stderr -S {exe} {script_fn}'.format(
+        sge_cmd = 'qsub -N {job_name} {sge_option} {specific} -o stdout -e stderr -S {exe} {script_fn}'.format(
                 **locals())
         system(sge_cmd, checked=True) # TODO: Capture q-jobid
     def kill(self, state, heartbeat):
@@ -367,13 +371,15 @@ class MetaJobTorque(object):
         #cwd = os.getcwd()
         job_name = self.get_jobname()
         sge_option = qstripped(self.mjob.job.options['sge_option'])
-        job_queue = self.mjob.job.options['job_queue']
+        if '-q' not in sge_option:
+            job_queue = self.mjob.job.options['job_queue']
+            sge_option = '-q {} '.format(job_queue) + sge_option
         cwd = os.getcwd()
         # Add shebang, in case shell_start_mode=unix_behavior.
         #   https://github.com/PacificBiosciences/FALCON/pull/348
         with open(script_fn, 'r') as original: data = original.read()
         with open(script_fn, 'w') as modified: modified.write("#!/bin/bash" + "\n" + data)
-        sge_cmd = 'qsub -N {job_name} -q {job_queue} {sge_option} {specific} -d {cwd} -o stdout -e stderr -S {exe} {script_fn}'.format(
+        sge_cmd = 'qsub -N {job_name} {sge_option} {specific} -d {cwd} -o stdout -e stderr -S {exe} {script_fn}'.format(
                 **locals())
         system(sge_cmd, checked=True) # TODO: Capture q-jobid
     def kill(self, state, heartbeat):
@@ -402,9 +408,11 @@ class MetaJobSlurm(object):
         """
         job_name = self.get_jobname()
         sge_option = qstripped(self.mjob.job.options['sge_option'])
-        job_queue = self.mjob.job.options['job_queue']
+        if '-p' not in sge_option:
+            job_queue = self.mjob.job.options['job_queue']
+            sge_option = '-p {} '.format(job_queue) + sge_option
         cwd = os.getcwd()
-        sge_cmd = 'sbatch -J {job_name} -p {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
+        sge_cmd = 'sbatch -J {job_name} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
                 **locals())
         # "By default all environment variables are propagated."
         #  http://slurm.schedmd.com/sbatch.html
@@ -434,8 +442,10 @@ class MetaJobLsf(object):
         """
         job_name = self.get_jobname()
         sge_option = qstripped(self.mjob.job.options['sge_option'])
-        job_queue = self.mjob.job.options['job_queue']
-        sge_cmd = 'bsub -J {job_name} -q {job_queue} {sge_option} -o stdout -e stderr "{exe} {script_fn}"'.format(
+        if '-q' not in sge_option:
+            job_queue = self.mjob.job.options['job_queue']
+            sge_option = '-q {} '.format(job_queue) + sge_option
+        sge_cmd = 'bsub -J {job_name} {sge_option} -o stdout -e stderr "{exe} {script_fn}"'.format(
                 **locals())
         # "Sets the user's execution environment for the job, including the current working directory, file creation mask, and all environment variables, and sets LSF environment variables before starting the job."
         system(sge_cmd, checked=True) # TODO: Capture q-jobid
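
All five submitters in fs_based.py gain the same guard: inject the configured job_queue only when the user's sge_option does not already pick one (-q for SGE/PBS/LSF, -p for SLURM). As a standalone sketch (ensure_queue is a hypothetical helper, not in pypeFLOW; it keeps the original's plain substring test, so an option like '-quiet' would also suppress the injection):

    def ensure_queue(sge_option, job_queue, flag='-q'):
        # Prepend '<flag> <job_queue>' unless the user already set it.
        if flag not in sge_option:
            sge_option = '{} {} '.format(flag, job_queue) + sge_option
        return sge_option

    print(ensure_queue('-pe smp 8', 'default'))            # -q default -pe smp 8
    print(ensure_queue('-pe smp 8 -q bigmem', 'default'))  # unchanged
    print(ensure_queue('-c 4', 'batch', flag='-p'))        # SLURM partition
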
diff --git a/pypeFLOW/pypeflow/do_task.py b/pypeFLOW/pypeflow/do_task.py
index 3e9a868..1366059 100644
--- a/pypeFLOW/pypeflow/do_task.py
+++ b/pypeFLOW/pypeflow/do_task.py
@@ -131,6 +131,10 @@ def run(json_fn, timeout, tmpdir):
     LOG.debug('Loading JSON from {!r}'.format(json_fn))
     cfg = json.loads(open(json_fn).read())
     LOG.debug(pprint.pformat(cfg))
+    rundir = os.path.dirname(json_fn)
+    with cd(rundir):
+        run_cfg_in_tmpdir(cfg, tmpdir)
+def run_cfg_in_tmpdir(cfg, tmpdir):
     for fn in cfg['inputs'].values():
         wait_for(fn)
     inputs = cfg['inputs']
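
run() above now derives the run directory from the task's JSON path and re-enters it before touching any inputs, so relative paths in the cfg behave the same no matter where the watcher spawned the process. The cd it calls is presumably a chdir context manager along these lines (a sketch, not the pypeflow source):

    import contextlib
    import os
    import tempfile

    @contextlib.contextmanager
    def cd(newdir):
        # Enter newdir for the duration of the with-block, then restore.
        prevdir = os.getcwd()
        os.chdir(newdir)
        try:
            yield
        finally:
            os.chdir(prevdir)

    with cd(tempfile.gettempdir()):
        print(os.getcwd())  # the temp dir
    print(os.getcwd())      # back where we started
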
diff --git a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
index e47bb76..5dc4c60 100644
--- a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
+++ b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
@@ -64,7 +64,6 @@ class PwatcherTaskQueue(object):
         #sge_option='-pe smp 8 -q default'
         for node in nodes:
             #node.satisfy() # This would do the job without a process-watcher.
-            mkdirs(node.wdir)
             generated_script_fn = node.execute() # misnomer; this only dumps task.json now
             if not generated_script_fn:
                 raise Exception('Missing generated_script_fn for Node {}'.format(node))
@@ -156,14 +155,29 @@ def get_unsatisfied_subgraph(g):
         if n.satisfied():
             unsatg.remove_node(n)
     return unsatg
-def find_next_ready(g, node):
+def find_all_ancestors(g):
+    ancestors = dict()
+    def get_ancestors(node):
+        """memoized"""
+        if node not in ancestors:
+            myancestors = set()
+            for pred in g.predecessors_iter(node):
+                myancestors.update(get_ancestors(pred))
+            ancestors[node] = myancestors
+        return ancestors[node]
+    for node in g.nodes():
+        get_ancestors(node)
+    return ancestors
+def find_next_ready_and_remove(g, node):
     """Given a recently satisfied node,
     return any successors with in_degree 1.
+    Then remove node from g immediately.
     """
     ready = set()
     for n in g.successors_iter(node):
         if 1 == g.in_degree(n):
             ready.add(n)
+    g.remove_node(node)
     return ready
 def find_all_roots(g):
     """Find all nodes in g which have no predecessors.
@@ -267,8 +281,7 @@ class Workflow(object):
             LOG.info('recently_satisfied: {!r}'.format(recently_satisfied))
             LOG.info('Num satisfied in this iteration: {}'.format(len(recently_satisfied)))
             for node in recently_satisfied:
-                ready.update(find_next_ready(unsatg, node))
-                unsatg.remove_node(node)
+                ready.update(find_next_ready_and_remove(unsatg, node))
             LOG.info('Num still unsatisfied: {}'.format(len(unsatg)))
             if recently_done:
                 msg = 'Some tasks are recently_done but not satisfied: {!r}'.format(recently_done)
@@ -283,7 +296,7 @@ class Workflow(object):
             raise Exception('We had {} failures. {} tasks remain unsatisfied.'.format(
                 failures, len(unsatg)))
 
-    def __init__(self, watcher, job_type, job_queue, max_jobs, use_tmpdir, sge_option=''):
+    def __init__(self, watcher, job_type, job_queue, max_jobs, use_tmpdir, squash, sge_option=''):
         self.graph = networkx.DiGraph()
         self.tq = PwatcherTaskQueue(watcher=watcher, job_type=job_type, job_queue=job_queue, sge_option=sge_option) # TODO: Inject this.
         assert max_jobs > 0, 'max_jobs needs to be set. If you use the "blocking" process-watcher, it is also the number of threads.'
@@ -291,6 +304,8 @@ class Workflow(object):
         self.sentinels = dict() # sentinel_done_fn -> Node
         self.pypetask2node = dict()
         self.use_tmpdir = use_tmpdir
+        self.squash = squash # This really should depend on each Task, but for now a global is fine.
+        # For small genomes, serial tasks should always be squashed.
 
 class NodeBase(object):
     """Graph node.
@@ -353,22 +368,15 @@ touch {sentinel_done_fn}
         self.name = name
         self.wdir = wdir
         self.needs = needs
-class ComboNode(NodeBase):
-    """Several Nodes to be executed in sequence.
-    Only this ComboNode will be in the DiGraph, not the sub-Nodes.
-    """
-    def generate_script(self):
-        raise NotImplementedError(repr(self))
-    def __init__(self, nodes):
-        #super(ComboNode, self).__init__(name, wdir, needs)
-        self.nodes = nodes
 class PypeNode(NodeBase):
     """
     We will clean this up later. For now, it is pretty tightly coupled to PypeTask.
     """
     def generate_script(self):
+        wdir = self.wdir
+        mkdirs(wdir)
         pt = self.pypetask
-        wdir = pt.wdir # For now, assume dir already exists.
+        assert pt.wdir == self.wdir
         inputs = {k:v.path for k,v in pt.inputs.items()}
         outputs = {k:os.path.relpath(v.path, wdir) for k,v in pt.outputs.items()}
         for v in outputs.values():
@@ -391,7 +399,7 @@ env | sort
 pwd
 time {cmd}
 """.format(**locals())
-        script_fn = os.path.join(pt.wdir, 'task.sh')
+        script_fn = os.path.join(wdir, 'task.sh')
         open(script_fn, 'w').write(script_content)
         return script_fn
     def old_generate_script(self):
@@ -481,6 +489,13 @@ def PypeTask(inputs, outputs, parameters=None, wdir=None):
     if wdir is None:
         #wdir = parameters.get('wdir', name) # One of these must be a string!
         wdir = find_work_dir([only_path(v) for v in outputs.values()])
+        # Since we derived wdir from outputs, we need to ensure that they
+        # are relative to wdir (they were given relative to the CWD).
+        for k,v in outputs.items():
+            if not isinstance(v, PypeLocalFile) and not os.path.isabs(v):
+                outputs[k] = os.path.relpath(v, wdir)
+    if not os.path.isabs(wdir):
+        wdir = os.path.abspath(wdir)
     this = _PypeTask(inputs, outputs, wdir, parameters)
     #basedir = os.path.basename(wdir)
     basedir = this.name
@@ -533,7 +548,7 @@ class _PypeTask(object):
         URL = 'task://localhost/{}'.format(name)
         self.inputs = inputs
         self.outputs = outputs
-        self.parameters = parameters
+        self.parameters = dict(parameters) # Always copy this, so caller can re-use, for convenience.
         self.wdir = wdir
         self.name = name
         self.URL = URL
@@ -562,6 +577,7 @@ def PypeProcWatcherWorkflow(
         max_jobs = 24, # must be > 0, but not too high
         sge_option = None,
         use_tmpdir = None,
+        squash = False,
         **attributes):
     """Factory for the workflow.
     """
@@ -580,8 +596,13 @@ def PypeProcWatcherWorkflow(
                 use_tmpdir = os.path.abspath(use_tmpdir)
         except (TypeError, AttributeError):
             use_tmpdir = tempfile.gettempdir()
-    LOG.info('job_type={!r}, job_queue={!r}, sge_option={!r}, use_tmpdir={!r}'.format(job_type, job_queue, sge_option, use_tmpdir))
-    return Workflow(watcher, job_type=job_type, job_queue=job_queue, max_jobs=max_jobs, sge_option=sge_option, use_tmpdir=use_tmpdir)
+    LOG.info('job_type={!r}, job_queue={!r}, sge_option={!r}, use_tmpdir={!r}, squash={!r}'.format(
+        job_type, job_queue, sge_option, use_tmpdir, squash,
+    ))
+    return Workflow(watcher,
+            job_type=job_type, job_queue=job_queue, max_jobs=max_jobs, sge_option=sge_option, use_tmpdir=use_tmpdir,
+            squash=squash,
+    )
     #th = MyPypeFakeThreadsHandler('mypwatcher', job_type, job_queue)
     #mq = MyMessageQueue()
     #se = MyFakeShutdownEvent() # TODO: Save pwatcher state on ShutdownEvent. (Not needed for blocking pwatcher. Mildly useful for fs_based.)
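
The find_next_ready_and_remove change above keeps the scheduler's invariant in one place: when a node finishes, exactly those successors whose in-degree is 1 (only the finished node still blocks them) become ready, and the node leaves the unsatisfied graph in the same step. A toy of that loop over a networkx DiGraph (assuming networkx is available, as FALCON already requires):

    import networkx

    def pop_satisfied(g, node):
        # Same contract as find_next_ready_and_remove: collect successors
        # that only this node was still blocking, then drop it from g.
        ready = set(n for n in g.successors(node) if g.in_degree(n) == 1)
        g.remove_node(node)
        return ready

    g = networkx.DiGraph()
    g.add_edges_from([('a', 'b'), ('a', 'c'), ('b', 'd'), ('c', 'd')])
    ready = set(n for n in g.nodes() if g.in_degree(n) == 0)  # the roots
    order = []
    while ready:
        node = ready.pop()
        order.append(node)  # a real workflow would launch the task here
        ready.update(pop_satisfied(g, node))
    print(order)  # a valid topological order, e.g. ['a', 'c', 'b', 'd']
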

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/falcon.git


