[med-svn] [falcon] 04/15: New upstream version 1.8.8

Afif Elghraoui afif at moszumanska.debian.org
Sat Dec 16 08:03:28 UTC 2017


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository falcon.

commit 6fa9904c346391e416d9372ffd499e22ac9fa5f5
Author: Afif Elghraoui <afif at debian.org>
Date:   Sat Dec 16 01:24:04 2017 -0500

    New upstream version 1.8.8
---
 DALIGNER/DBX.c                                     |   85 ++
 DALIGNER/DBX.h                                     |   25 +
 DALIGNER/GNUmakefile                               |    1 +
 DALIGNER/LA4Falcon.c                               |   58 +-
 DALIGNER/bamboo_build.sh                           |   28 +
 DAZZ_DB/bamboo_build.sh                            |   23 +
 FALCON-examples/git-sym.makefile                   |    6 +-
 FALCON-examples/makefile                           |    2 +-
 FALCON-examples/run/ecoli/fc_run.cfg               |    8 +-
 FALCON-examples/run/greg200k-sv2/fc_run.cfg        |   13 +
 FALCON-examples/run/greg200k-sv2/fc_unzip.cfg      |   14 +-
 FALCON-examples/run/greg200k-sv2/input.fofn        |    4 +-
 FALCON-examples/run/greg200k-sv2/input_bam.fofn    |    2 +
 .../run/synth0/{fc_run.cfg => fc_preads.cfg}       |   26 +-
 FALCON-examples/run/synth0/fc_run.cfg              |   19 +-
 FALCON-examples/run/synth0/makefile                |    4 +-
 FALCON-examples/run/synth0/preads.fofn             |    2 +
 FALCON/.travis.yml                                 |    4 +-
 FALCON/bamboo_build.sh                             |   28 +
 FALCON/bamboo_test.sh                              |   14 +
 FALCON/falcon_kit/FastaReader.py                   |  165 +--
 FALCON/falcon_kit/__init__.py                      |   38 -
 FALCON/falcon_kit/bash.py                          |   19 +-
 FALCON/falcon_kit/falcon_kit.py                    |   47 +-
 FALCON/falcon_kit/fc_asm_graph.py                  |   44 +-
 FALCON/falcon_kit/functional.py                    |   22 +-
 FALCON/falcon_kit/mains/actg_coordinate.py         |   14 +-
 FALCON/falcon_kit/mains/calc_cutoff.py             |   20 +-
 FALCON/falcon_kit/mains/dedup_a_tigs.py            |    6 +-
 FALCON/falcon_kit/mains/fetch_reads.py             |   36 +-
 FALCON/falcon_kit/mains/get_read_ctg_map.py        |    3 +-
 FALCON/falcon_kit/mains/graph_to_contig.py         |    6 +-
 FALCON/falcon_kit/mains/ovlp_filter.py             |    8 +-
 FALCON/falcon_kit/mains/ovlp_to_graph.py           |    3 +-
 FALCON/falcon_kit/mains/run1.py                    |   55 +-
 FALCON/falcon_kit/pype_tasks.py                    |    4 +-
 FALCON/falcon_kit/run_support.py                   |   49 +-
 FALCON/falcon_kit/stats_preassembly.py             |   17 +-
 FALCON/falcon_kit/util/system.py                   |    9 +
 FALCON/makefile                                    |   65 ++
 FALCON/mycoverage.cfg                              |    5 +
 FALCON/mysitecustomize.py                          |    3 +
 FALCON/setup.py                                    |    1 +
 FALCON/src/c/falcon.c                              |   13 +-
 FALCON/src/py_scripts_v0.1/falcon_asm.py           | 1154 ------------------
 FALCON/src/py_scripts_v0.1/falcon_asm_s.py         | 1220 --------------------
 FALCON/src/py_scripts_v0.1/falcon_dedup.py         |  119 --
 FALCON/src/py_scripts_v0.1/falcon_fixasm.py        |  213 ----
 FALCON/src/py_scripts_v0.1/falcon_overlap.py       |  328 ------
 FALCON/src/py_scripts_v0.1/falcon_overlap2.py      |  337 ------
 FALCON/src/py_scripts_v0.1/falcon_qrm.py           |  370 ------
 FALCON/src/py_scripts_v0.1/falcon_qrm_0.py         |  378 ------
 FALCON/src/py_scripts_v0.1/falcon_sense.py         |  248 ----
 FALCON/src/py_scripts_v0.1/falcon_ucns_data.py     |  120 --
 FALCON/src/py_scripts_v0.1/falcon_utgcns.py        |  124 --
 FALCON/src/py_scripts_v0.1/get_ovl.sh              |    7 -
 FALCON/src/py_scripts_v0.1/get_rdata.py            |  207 ----
 FALCON/src/py_scripts_v0.1/overlapper.py           |  216 ----
 FALCON/src/py_scripts_v0.1/ovlp_filter.sh          |    6 -
 FALCON/src/py_scripts_v0.1/redis_graph.py          |   79 --
 FALCON/src/py_scripts_v0.1/remove_dup_ctg.py       |   75 --
 FALCON/test/helpers.py                             |    4 +
 FALCON/test/test_calc_cutoff.py                    |   43 +
 FALCON/test/test_functional.py                     |   14 +
 FALCON/test/test_stats_preassembly.py              |   17 +-
 FALCON/test_data/calc_cutoff/partial_capture.txt   |    5 +
 FALCON/travis.sh                                   |   17 +-
 bamboo_build_and_test.sh                           |   42 +
 makefile                                           |    6 +-
 pypeFLOW/bamboo_build.sh                           |   29 +
 pypeFLOW/makefile                                  |    3 +
 pypeFLOW/pwatcher/blocking.py                      |    4 +-
 pypeFLOW/pwatcher/fs_based.py                      |   54 +-
 pypeFLOW/pwatcher/mains/pypeflow_example.py        |   14 +-
 pypeFLOW/pwatcher/mains/query_server.py            |    3 +-
 pypeFLOW/pwatcher/network_based.py                 |   17 +-
 pypeFLOW/pypeflow/do_task.py                       |   30 +-
 pypeFLOW/pypeflow/simple_pwatcher_bridge.py        |    2 +-
 pypeFLOW/pypeflow/util.py                          |    7 +
 travis.sh                                          |   11 +-
 80 files changed, 873 insertions(+), 5668 deletions(-)

diff --git a/DALIGNER/DBX.c b/DALIGNER/DBX.c
new file mode 100644
index 0000000..2a84fbd
--- /dev/null
+++ b/DALIGNER/DBX.c
@@ -0,0 +1,85 @@
+#include "DBX.h"
+#include "DB.h"
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <assert.h>
+
+// From Jason, with 1 change
+static char* Load_Read_Data(HITS_DB *db) {
+  FILE  *bases  = (FILE*) db->bases;
+  struct stat sbuf;
+  char  *data;
+
+  bases = fopen(Catenate(db->path,"","",".bps"),"r");
+  if (bases == NULL) EXIT(1);
+  stat(Catenate(db->path,"","",".bps"), &sbuf);
+  data = (char *) malloc(sbuf.st_size);
+  if (data == NULL) return NULL; // was EXIT(1), but we can proceed
+  fread(data, sbuf.st_size, 1, bases);
+  fclose(bases);
+  return(data);
+}
+
+// Wrapper
+int Open_DBX(char *path, HITS_DBX *dbx, bool preload) {
+  dbx->data = NULL;
+  int rc = Open_DB(path, &dbx->db);
+  switch (rc) {
+    case -1:
+      return -1;
+    case 0:
+      break;
+    case 1:
+      assert(rc != 1);
+      abort();
+    default:
+      assert(rc < -1 || rc > 1);
+      abort();
+  }
+  if (preload) {
+    dbx->data = Load_Read_Data(&dbx->db);
+  }
+  return 0;
+}
+
+// From Jason
+static int Load_Read_From_RAM(HITS_DB *db, char *data, int i, char *read, int ascii) {
+  int64      off;
+  int        len, clen;
+  HITS_READ *r = db->reads;
+
+  if (i >= db->nreads) { EXIT(1); }
+
+  off = r[i].boff;
+  len = r[i].rlen;
+  clen = COMPRESSED_LEN(len);
+  if (clen > 0) { memcpy(read, data + off, clen); } //fread(read,clen,1,bases)
+  Uncompress_Read(len, read);
+  if (ascii == 1)
+    { Lower_Read(read);
+      read[-1] = '\0';
+    }
+  else if (ascii == 2)
+    { Upper_Read(read);
+      read[-1] = '\0';
+    }
+  else
+    read[-1] = 4;
+  return (0);
+}
+
+// Wrapper
+int Load_ReadX(HITS_DBX *dbx, int i, char *read, int ascii) {
+  if (dbx->data) {
+    return Load_Read_From_RAM(&dbx->db, dbx->data, i, read, ascii);
+  } else {
+    return Load_Read(&dbx->db, i, read, ascii);
+  }
+}
+
+// Wrapper
+void Close_DBX(HITS_DBX *dbx) {
+  Close_DB(&dbx->db);
+  if (dbx->data) free(dbx->data);
+}
diff --git a/DALIGNER/DBX.h b/DALIGNER/DBX.h
new file mode 100644
index 0000000..8fd9ace
--- /dev/null
+++ b/DALIGNER/DBX.h
@@ -0,0 +1,25 @@
+#ifndef DALIGNER_DBX_H
+#define DALIGNER_DBX_H
+/* Wrappers to extend HITS_DB.
+ *
+ * Note that none of the extra fields are ever stored on-disk.
+ */
+#include "DB.h"
+#include <stdbool.h>
+
+typedef struct {
+	HITS_DB db;
+/*
+ * When "data" is non-null, it stores the entire DB
+ * in memory, so we can avoid random-access disk operations.
+ * But if null, then wrappers simply delegate.
+ */
+	char* data;
+} HITS_DBX;
+
+int Open_DBX(char *path, HITS_DBX *dbx, bool preload);
+int  Load_ReadX(HITS_DBX *dbx, int i, char *read, int ascii);
+//void Trim_DB(HITS_DBX *dbx);
+void Close_DBX(HITS_DBX *dbx);
+
+#endif
diff --git a/DALIGNER/GNUmakefile b/DALIGNER/GNUmakefile
index d0ceb2f..3d81c36 100644
--- a/DALIGNER/GNUmakefile
+++ b/DALIGNER/GNUmakefile
@@ -15,6 +15,7 @@ vpath %.a ${THISDIR}/../DAZZ_DB
 all: ${ALL}
 daligner: filter.o
 daligner_p: filter_p.o
+LA4Falcon: DBX.o
 ${ALL}: align.o
 
 install:
diff --git a/DALIGNER/LA4Falcon.c b/DALIGNER/LA4Falcon.c
index 2ee07a6..ef642b7 100644
--- a/DALIGNER/LA4Falcon.c
+++ b/DALIGNER/LA4Falcon.c
@@ -1,3 +1,4 @@
+/* vim: set et ts=2 sts=2 sw=2 : */
 /************************************************************************************\
 *                                                                                    *
 * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved.                *
@@ -54,18 +55,13 @@
  *  Last Mod:  July 2015
  *
  *******************************************************************************************/
+#include "DB.h"
+#include "DBX.h"
+#include "align.h"
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdint.h>
-#include <ctype.h>
-#include <unistd.h>
-#include <stdbool.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "DB.h"
-#include "align.h"
 
 #define MAX_OVERLAPS 50000
 
@@ -144,12 +140,13 @@ static bool add_overlap(const Alignment *aln, const Overlap *ovl, const int coun
     return added;
 }
 
-static void print_hits(const int hit_count, HITS_DB *db2, char *bbuffer, char buffer[], int64 bsize, const int MAX_HIT_COUNT) {
+static void print_hits(const int hit_count, HITS_DBX *dbx2, char *bbuffer, char buffer[], int64 bsize, const int MAX_HIT_COUNT) {
     int tmp_idx;
     qsort(ovlgrps, (hit_count+1), sizeof(OverlapGroup), compare_ovlgrps);
     for (tmp_idx = 0; tmp_idx < (hit_count+1) && tmp_idx < MAX_HIT_COUNT; tmp_idx++) {
         OverlapGroup *grp = &ovlgrps[tmp_idx];
-        Load_Read(db2, grp->end.bread, bbuffer, 0);
+        //Load_ReadX assuming db2 == db1 is true
+        Load_ReadX(dbx2, grp->end.bread, bbuffer, 0);
         if (COMP(grp->end.flags)) Complement_Seq(bbuffer, grp->blen );
         Upper_Read(bbuffer);
         int64 const rlen = (int64)(grp->end.path.bepos) - (int64)(grp->beg.path.bbpos);
@@ -178,8 +175,10 @@ static int ORDER(const void *l, const void *r)
 }
 
 int main(int argc, char *argv[])
-{ HITS_DB   _db1, *db1 = &_db1;
-  HITS_DB   _db2, *db2 = &_db2;
+{ HITS_DBX   _dbx1, *dbx1 = &_dbx1;
+  HITS_DBX   _dbx2, *dbx2 = &_dbx2;
+  HITS_DB *db1 = &dbx1->db;
+  HITS_DB *db2 = &dbx2->db;
   Overlap   _ovl, *ovl = &_ovl;
   Alignment _aln, *aln = &_aln;
 
@@ -196,6 +195,7 @@ int main(int argc, char *argv[])
   int     FALCON, OVERLAP, M4OVL;
   // XXX: MAX_HIT_COUNT should be renamed
   int     SEED_MIN, MAX_HIT_COUNT, SKIP;
+  int     PRELOAD;
 
   //  Process options
 
@@ -225,7 +225,7 @@ int main(int argc, char *argv[])
       if (argv[i][0] == '-')
         switch (argv[i][1])
         { default:
-            ARG_FLAGS("smfocargUFM")
+            ARG_FLAGS("smfocargUFMP")
             break;
           case 'i':
             ARG_NON_NEGATIVE(INDENT,"Indent")
@@ -259,6 +259,7 @@ int main(int argc, char *argv[])
     FALCON    = flags['f'];
     SKIP      = flags['s'];
     GROUP     = flags['g'];
+    PRELOAD   = flags['P']; // Preload DB reads, if possible.
 
     if (argc <= 2)
       { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
@@ -274,7 +275,7 @@ int main(int argc, char *argv[])
     FILE *input;
 
     ISTWO  = 0;
-    status = Open_DB(argv[1],db1);
+    status = Open_DBX(argv[1],dbx1,PRELOAD);
     if (status < 0)
       exit (1);
     if (db1->part > 0)
@@ -288,7 +289,7 @@ int main(int argc, char *argv[])
         if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL)
           { ISTWO = 1;
             fclose(input);
-            status = Open_DB(argv[2],db2);
+            status = Open_DBX(argv[2],dbx2,PRELOAD);
             if (status < 0)
               exit (1);
             if (db2->part > 0)
@@ -298,12 +299,16 @@ int main(int argc, char *argv[])
             Trim_DB(db2);
           }
         else
-          db2 = db1;
+            { dbx2 = dbx1;
+              db2 = db1;
+            }
         free(root);
         free(pwd);
       }
     else
-      db2 = db1;
+      { dbx2 = dbx1;
+        db2 = db1;
+      }
     Trim_DB(db1);
   }
 
@@ -697,16 +702,16 @@ int main(int argc, char *argv[])
         if (FALCON)
           {
             if (p_aread == -1) {
-                Load_Read(db1, ovl->aread, abuffer, 2);
+                Load_ReadX(dbx1, ovl->aread, abuffer, 2);
                 printf("%08d %s\n", ovl->aread, abuffer);
                 p_aread = ovl->aread;
                 skip_rest = 0;
             }
             if (p_aread != ovl -> aread ) {
-                print_hits(hit_count, db2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
+                print_hits(hit_count, dbx2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
                 hit_count = -1;
 
-                Load_Read(db1, ovl->aread, abuffer, 2);
+                Load_ReadX(dbx1, ovl->aread, abuffer, 2);
                 printf("%08d %s\n", ovl->aread, abuffer);
                 p_aread = ovl->aread;
                 skip_rest = 0;
@@ -725,8 +730,8 @@ int main(int argc, char *argv[])
                     tps = ((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace);
                     if (small)
                         Decompress_TraceTo16(ovl);
-                    Load_Read(db1, ovl->aread, abuffer, 0);
-                    Load_Read(db2, ovl->bread, bbuffer, 0);
+                    Load_ReadX(dbx1, ovl->aread, abuffer, 0);
+                    Load_ReadX(dbx2, ovl->bread, bbuffer, 0);
                     if (COMP(aln->flags))
                         Complement_Seq(bbuffer, aln->blen);
                     Compute_Trace_PTS(aln,work,tspace);
@@ -828,7 +833,7 @@ int main(int argc, char *argv[])
 
     if (FALCON && hit_count != -1)
       {
-        print_hits(hit_count, db2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
+        print_hits(hit_count, dbx2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
         printf("- -\n");
         free(ovlgrps);
       }
@@ -842,9 +847,8 @@ int main(int argc, char *argv[])
       }
   }
 
-  Close_DB(db1);
+  Close_DBX(dbx1);
   if (ISTWO)
-    Close_DB(db2);
-
+    Close_DBX(dbx2);
   exit (0);
 }
diff --git a/DALIGNER/bamboo_build.sh b/DALIGNER/bamboo_build.sh
new file mode 100644
index 0000000..b9193e5
--- /dev/null
+++ b/DALIGNER/bamboo_build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -ex
+type module >& /dev/null || source /mnt/software/Modules/current/init/bash
+
+set -vex
+module load gcc/4.9.2
+module load git/2.8.3
+module load ccache
+NEXUS_BASEURL=http://ossnexus.pacificbiosciences.com/repository
+NEXUS_URL=$NEXUS_BASEURL/unsupported/gcc-4.9.2
+
+rm -rf prebuilt build
+mkdir -p prebuilt/DAZZ_DB build/bin
+curl -s -L $NEXUS_URL/DAZZ_DB-SNAPSHOT.tgz|tar zxf - -C prebuilt/DAZZ_DB
+mkdir -p DAZZ_DB
+cp prebuilt/DAZZ_DB/lib/*.a DAZZ_DB/
+cp prebuilt/DAZZ_DB/include/*.h DAZZ_DB/
+
+make -C DALIGNER clean
+make -C DALIGNER LIBDIRS=$PWD/prebuilt/DAZZ_DB/lib -j
+make -C DALIGNER PREFIX=$PWD/build install
+
+make -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile clean
+PATH=.:${PATH} make -C DALIGNER -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile
+make -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile clean
+
+cd build
+tar zcf DALIGNER-SNAPSHOT.tgz bin
+curl -v -n --upload-file DALIGNER-SNAPSHOT.tgz $NEXUS_URL/DALIGNER-SNAPSHOT.tgz
diff --git a/DAZZ_DB/bamboo_build.sh b/DAZZ_DB/bamboo_build.sh
new file mode 100644
index 0000000..5748fa4
--- /dev/null
+++ b/DAZZ_DB/bamboo_build.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -xe
+type module >& /dev/null || source /mnt/software/Modules/current/init/bash
+
+set -vex
+
+module load gcc/4.9.2
+module load git/2.8.3
+module load ccache
+
+rm -rf build
+mkdir -p build/lib build/bin build/include
+cd DAZZ_DB
+make clean
+make -j 
+make PREFIX=$PWD/../build install
+cp *.h ../build/include
+cd -
+cd build
+tar zcf DAZZ_DB-SNAPSHOT.tgz bin lib include
+NEXUS_BASEURL=http://ossnexus.pacificbiosciences.com/repository
+NEXUS_URL=$NEXUS_BASEURL/unsupported/gcc-4.9.2
+curl -v -n --upload-file DAZZ_DB-SNAPSHOT.tgz $NEXUS_URL/DAZZ_DB-SNAPSHOT.tgz
+cd -
diff --git a/FALCON-examples/git-sym.makefile b/FALCON-examples/git-sym.makefile
index ed9b703..c7b7fb5 100644
--- a/FALCON-examples/git-sym.makefile
+++ b/FALCON-examples/git-sym.makefile
@@ -24,8 +24,8 @@ synth0.ref.fasta:
 arab-creads.fasta:
 	cp -f /lustre/hpcprod/cdunn/data/arab_test/corrected.fasta $@
 synth5k.2016-11-02:
-	curl -L https://downloads.pacbcloud.com/public/data/git-sym/synth5k.2016-11-02.tgz | tar xvfz -
+	curl -kL https://downloads.pacbcloud.com/public/data/git-sym/synth5k.2016-11-02.tgz | tar xvfz -
 ecoli.m140913_050931_42139_c100713652400000001823152404301535_s1_p0:
 	curl -L https://downloads.pacbcloud.com/public/data/git-sym/ecoli.m140913_050931_42139_c100713652400000001823152404301535_s1_p0.subreads.tar | tar xvf -
-greg200k-sv2:
-	curl -L https://downloads.pacbcloud.com/public/data/git-sym/greg200k-sv2.tar | tar xvf -
+greg200k-sv2.2:
+	curl -L https://downloads.pacbcloud.com/public/data/git-sym/greg200k-sv2.2.tar | tar xvf -
diff --git a/FALCON-examples/makefile b/FALCON-examples/makefile
index d600c35..948eb23 100644
--- a/FALCON-examples/makefile
+++ b/FALCON-examples/makefile
@@ -1,7 +1,7 @@
 default:
 	@echo 'Try "make run-foo" for any sub-dir of run/.'
 run-%: setup-%
-	cd run/$*; fc_run.py fc_run.cfg logging.ini
+	cd run/$*; rm -rf 0-rawreads/ 1-preads_ovl/ 2-asm-falcon/; ls -l; fc_run.py fc_run.cfg logging.ini
 setup-%:
 	git-sym update run/$*
 	git-sym show run/$*
diff --git a/FALCON-examples/run/ecoli/fc_run.cfg b/FALCON-examples/run/ecoli/fc_run.cfg
index c2d5682..12a24ce 100644
--- a/FALCON-examples/run/ecoli/fc_run.cfg
+++ b/FALCON-examples/run/ecoli/fc_run.cfg
@@ -1,5 +1,9 @@
 [General]
-#job_type = local
+use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD}
+job_queue = bash -C ${CMD} > ${STDOUT_FILE} 2> ${STDERR_FILE}
 
 # list of files of the initial bas.h5 files
 input_fofn = input.fofn
@@ -25,6 +29,8 @@ sge_option_cns = -pe smp 8 -q %(jobqueue)s
 
 pa_concurrent_jobs = 32
 ovlp_concurrent_jobs = 32
+pa_concurrent_jobs = 6
+ovlp_concurrent_jobs = 6
 
 pa_HPCdaligner_option =  -v -B4 -t16 -e.70 -l1000 -s1000
 ovlp_HPCdaligner_option = -v -B4 -t32 -h60 -e.96 -l500 -s1000
diff --git a/FALCON-examples/run/greg200k-sv2/fc_run.cfg b/FALCON-examples/run/greg200k-sv2/fc_run.cfg
index a8fb8c4..91c1e1c 100755
--- a/FALCON-examples/run/greg200k-sv2/fc_run.cfg
+++ b/FALCON-examples/run/greg200k-sv2/fc_run.cfg
@@ -5,9 +5,22 @@ input_fofn = input.fofn
 
 job_type = local
 
+use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD}
+
 input_type = raw
 #input_type = preads
 
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
 #openending = True
 
 # The length cutoff used for seed reads used for initial mapping
diff --git a/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg b/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg
index c9de3af..2b6f9cd 100644
--- a/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg
+++ b/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg
@@ -2,13 +2,25 @@
 job_type = SGE
 job_type = local
 
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+#job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
+max_n_open_files = 1000
+
 [Unzip]
 
 input_fofn= input.fofn
 input_bam_fofn= input_bam.fofn
 #smrt_bin= /mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/
 #smrt_bin=/mnt/secondary/builds/full/3.0.1/prod/current-build_smrtanalysis/smrtcmds/bin/
-smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
+#smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
+#smrt_bin=/mnt/secondary/builds/full/3.2.0/prod/current-build_smrttools-incremental/smrtcmds/bin/ # does not work
+smrt_bin=/pbi/dept/secondary/builds/4.1.0/current_smrttools_incremental_installdir/smrtcmds/bin
 sge_phasing= -pe smp 12 -q bigmem
 sge_quiver= -pe smp 12 -q sequel-farm
 sge_track_reads= -pe smp 12 -q default
diff --git a/FALCON-examples/run/greg200k-sv2/input.fofn b/FALCON-examples/run/greg200k-sv2/input.fofn
index 99cce2d..84a7c53 100644
--- a/FALCON-examples/run/greg200k-sv2/input.fofn
+++ b/FALCON-examples/run/greg200k-sv2/input.fofn
@@ -1,2 +1,2 @@
-data/greg200k-sv2/subreads1.dexta
-data/greg200k-sv2/subreads2.dexta
+data/greg200k-sv2/subreads1.fasta
+data/greg200k-sv2/subreads2.fasta
diff --git a/FALCON-examples/run/greg200k-sv2/input_bam.fofn b/FALCON-examples/run/greg200k-sv2/input_bam.fofn
new file mode 100644
index 0000000..af301f1
--- /dev/null
+++ b/FALCON-examples/run/greg200k-sv2/input_bam.fofn
@@ -0,0 +1,2 @@
+data/greg200k-sv2/subreads1.bam
+data/greg200k-sv2/subreads2.bam
diff --git a/FALCON-examples/run/synth0/fc_run.cfg b/FALCON-examples/run/synth0/fc_preads.cfg
similarity index 71%
copy from FALCON-examples/run/synth0/fc_run.cfg
copy to FALCON-examples/run/synth0/fc_preads.cfg
index 0740760..2d92250 100644
--- a/FALCON-examples/run/synth0/fc_run.cfg
+++ b/FALCON-examples/run/synth0/fc_preads.cfg
@@ -1,15 +1,24 @@
 [General]
-use_tmpdir = true
-job_type = local
+#use_tmpdir = true
+#job_type = local
 #job_type = sge
-#stop_all_jobs_on_failure = true
+stop_all_jobs_on_failure = true
+
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
 
 # list of files of the initial bas.h5 files
-input_fofn = input.fofn
-#input_fofn = preads.fofn
+#input_fofn = input.fofn
+input_fofn = preads.fofn
 
-input_type = raw
-#input_type = preads
+#input_type = raw
+input_type = preads
 
 # The length cutoff used for seed reads used for initial mapping
 #length_cutoff = 1
@@ -20,7 +29,7 @@ seed_coverage = 20
 length_cutoff_pr = 1
 
 
-job_queue = production
+#job_queue = production
 sge_option_da = -pe smp 8 -q %(job_queue)s
 sge_option_la = -pe smp 2 -q %(job_queue)s
 sge_option_pda = -pe smp 8 -q %(job_queue)s
@@ -40,6 +49,7 @@ pa_DBsplit_option =   -a -x5 -s.065536
 #pa_DBsplit_option =   -a -x5 -s1
 ovlp_DBsplit_option = -a -x5 -s50
 
+LA4Falcon_preload = true
 falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 1 --max_n_read 20000 --n_core 0
 #--min_cov_aln 1 --min_len_aln 40
 
diff --git a/FALCON-examples/run/synth0/fc_run.cfg b/FALCON-examples/run/synth0/fc_run.cfg
index 0740760..6de3553 100644
--- a/FALCON-examples/run/synth0/fc_run.cfg
+++ b/FALCON-examples/run/synth0/fc_run.cfg
@@ -1,8 +1,18 @@
 [General]
-use_tmpdir = true
-job_type = local
+#use_tmpdir = true
+#job_type = local
 #job_type = sge
-#stop_all_jobs_on_failure = true
+stop_all_jobs_on_failure = true
+
+#skip_checks = true
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
 
 # list of files of the initial bas.h5 files
 input_fofn = input.fofn
@@ -20,7 +30,7 @@ seed_coverage = 20
 length_cutoff_pr = 1
 
 
-job_queue = production
+#job_queue = production
 sge_option_da = -pe smp 8 -q %(job_queue)s
 sge_option_la = -pe smp 2 -q %(job_queue)s
 sge_option_pda = -pe smp 8 -q %(job_queue)s
@@ -40,6 +50,7 @@ pa_DBsplit_option =   -a -x5 -s.065536
 #pa_DBsplit_option =   -a -x5 -s1
 ovlp_DBsplit_option = -a -x5 -s50
 
+LA4Falcon_preload = true
 falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 1 --max_n_read 20000 --n_core 0
 #--min_cov_aln 1 --min_len_aln 40
 
diff --git a/FALCON-examples/run/synth0/makefile b/FALCON-examples/run/synth0/makefile
index e7ef88e..14ca0ea 100644
--- a/FALCON-examples/run/synth0/makefile
+++ b/FALCON-examples/run/synth0/makefile
@@ -1,10 +1,12 @@
 # This will show 'shift by 273', but we do not mind if the shift changes,
 # since it is circular. We just want output to match input with some shift,
 # and maybe with reverse-complement.
+FC_CFG?=fc_run.cfg
+
 go: run
 	${MAKE} test
 run:
-	fc_run fc_run.cfg logging.json
+	fc_run ${FC_CFG} logging.json
 test:
 	./check.py
 clean:
diff --git a/FALCON-examples/run/synth0/preads.fofn b/FALCON-examples/run/synth0/preads.fofn
new file mode 100644
index 0000000..63753e6
--- /dev/null
+++ b/FALCON-examples/run/synth0/preads.fofn
@@ -0,0 +1,2 @@
+data/preads/cns_00001.fasta
+data/preads/cns_00002.fasta
diff --git a/FALCON/.travis.yml b/FALCON/.travis.yml
index dd67f33..baa1e88 100644
--- a/FALCON/.travis.yml
+++ b/FALCON/.travis.yml
@@ -12,7 +12,9 @@
 #sudo: required
 os:
   - linux
-language: python
+#language: python # This seems to cause virtualenv, which we do not want. We prefer a --user install.
+# But to speed-up start-up,
+language: c
 compiler:
   - clang  # hmm. distutils uses 'gcc' anyway
 #  - gcc
diff --git a/FALCON/bamboo_build.sh b/FALCON/bamboo_build.sh
new file mode 100644
index 0000000..f003289
--- /dev/null
+++ b/FALCON/bamboo_build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+module unload git gcc ccache
+module load git/2.8.3
+module load gcc/4.9.2
+module load ccache/3.2.3
+#module load make
+
+set -vx
+git --version
+which gcc
+which g++
+gcc --version
+# We cannot use /bin/python without /bin/gcc.
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+
+make install-edit
+# Note: no --edit because we might be building artifacts.
+# ... Scratch that. We have trouble getting coverage for
+#  source=falcon_kit
+# but maybe it will work with a --edit install.
+
+make pylint
diff --git a/FALCON/bamboo_test.sh b/FALCON/bamboo_test.sh
new file mode 100644
index 0000000..e8876be
--- /dev/null
+++ b/FALCON/bamboo_test.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+set -vex
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+export PATH=$PYTHONUSERBASE/bin:$PATH
+
+pip install --user pytest coverage
+#make test
+make coverage-install
+make coverage
+chmod -R ugo+rwx .
diff --git a/FALCON/falcon_kit/FastaReader.py b/FALCON/falcon_kit/FastaReader.py
index e1a7780..81c8d8c 100644
--- a/FALCON/falcon_kit/FastaReader.py
+++ b/FALCON/falcon_kit/FastaReader.py
@@ -1,44 +1,18 @@
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
 from os.path import abspath, expanduser
 from cStringIO import StringIO
+import contextlib
+import gzip
 import md5
 import re
+import subprocess
+
+##
+## Utility functions for FastaReader
+##
+def wrap(s, columns):
+    return "\n".join(s[start:start+columns]
+                     for start in xrange(0, len(s), columns))
+
 
 def splitFastaHeader( name ):
     """
@@ -70,51 +44,6 @@ def splitFileContents(f, delimiter, BLOCKSIZE=8192):
             remainder.write(part)
     yield remainder.getvalue()
 
-def isFileLikeObject(o):
-    return hasattr(o, "read") and hasattr(o, "write")
-
-def getFileHandle(filenameOrFile, mode="r"):
-    """
-    Given a filename not ending in ".gz", open the file with the
-    appropriate mode.
-    Given a filename ending in ".gz", return a filehandle to the
-    unzipped stream.
-    Given a file object, return it unless the mode is incorrect--in
-    that case, raise an exception.
-    """
-    assert mode in ("r", "w")
-
-    if isinstance(filenameOrFile, basestring):
-        filename = abspath(expanduser(filenameOrFile))
-        if filename.endswith(".gz"):
-            return gzip.open(filename, mode)
-        else:
-            return open(filename, mode)
-    elif isFileLikeObject(filenameOrFile):
-        return filenameOrFile
-    else:
-        raise Exception("Invalid type to getFileHandle")
-
-
-class ReaderBase(object):
-    def __init__(self, f):
-        """
-        Prepare for iteration through the records in the file
-        """
-        self.file = getFileHandle(f, "r")
-
-    def close(self):
-        """
-        Close the underlying file
-        """
-        self.file.close()
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
 
 class FastaRecord(object):
     """
@@ -198,18 +127,6 @@ class FastaRecord(object):
         except AssertionError:
             raise ValueError("String not recognized as a valid FASTA record")
 
-    def reverseComplement(self, preserveHeader=False):
-        """
-        Return a new FastaRecord with the reverse-complemented DNA sequence.
-        Optionally, supply a name
-        """
-        rcSequence = sequences.reverseComplement(self.sequence)
-        if preserveHeader:
-            return FastaRecord(self.name, rcSequence)
-        else:
-            rcName = '{0} [revcomp]'.format(self.name.strip())
-            return FastaRecord(rcName, rcSequence)
-
     def __eq__(self, other):
         if isinstance(other, self.__class__):
             return (self.name     == other.name and
@@ -229,8 +146,34 @@ class FastaRecord(object):
             wrap(self.sequence, self.COLUMNS)
 
 
-class FastaReader(ReaderBase):
+# These are refactored from ReaderBase/FastaReader.
+
+def yield_fasta_records(f, fn):
+    """
+    f: fileobj
+    fn: str - filename (for exceptions)
     """
+    try:
+        parts = splitFileContents(f, ">")
+        assert "" == next(parts)
+        for part in parts:
+            yield FastaRecord.fromString(">" + part)
+    except AssertionError:
+        raise Exception("Invalid FASTA file {!r}".format(fn))
+
+
+def stream_stdout(call, fn):
+    args = call.split()
+    proc = subprocess.Popen(args, stdin=open(fn), stdout=subprocess.PIPE)
+    return proc.stdout
+
+ at contextlib.contextmanager
+def open_fasta_reader(fn):
+    """
+    fn: str - filename
+
+    Note: If you already have a fileobj, you can iterate over yield_fasta_records() directly.
+
     Streaming reader for FASTA files, useable as a one-shot iterator
     over FastaRecord objects.  Agnostic about line wrapping.
     Example:
@@ -239,22 +182,32 @@ class FastaReader(ReaderBase):
         > from pbcore import data
         > filename = data.getTinyFasta()
         > r = FastaReader(filename)
-        > for record in r:
+        > with open_fasta_reader(filename) as r:
+        ...  for record in r:
         ...     print record.name, len(record.sequence), record.md5
         ref000001|EGFR_Exon_2 183 e3912e9ceacd6538ede8c1b2adda7423
         ref000002|EGFR_Exon_3 203 4bf218da37175a91869033024ac8f9e9
         ref000003|EGFR_Exon_4 215 245bc7a046aad0788c22b071ed210f4d
         ref000004|EGFR_Exon_5 157 c368b8191164a9d6ab76fd328e2803ca
-        > r.close()
     """
-    DELIMITER = ">"
+    filename = abspath(expanduser(fn))
+    mode = 'r'
+    if filename.endswith(".gz"):
+        ofs = gzip.open(filename, mode)
+    elif filename.endswith(".dexta"):
+        ofs = stream_stdout("undexta -vkU -w60 -i", filename)
+    else:
+        ofs = open(filename, mode)
+    yield yield_fasta_records(ofs, filename)
+    ofs.close()
 
-    def __iter__(self):
-        try:
-            parts = splitFileContents(self.file, ">")
-            assert "" == next(parts)
-            for part in parts:
-                yield FastaRecord.fromString(">" + part)
-        except AssertionError:
-            raise ValueError("Invalid FASTA file")
 
+class FastaReader(object):
+    """Deprecated, but should still work (with filenames).
+    """
+    def __iter__(self):
+        with open_fasta_reader(self.filename) as reader:
+            for rec in reader:
+                yield rec
+    def __init__(self, f):
+        self.filename = f
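A minimal usage sketch of the refactored reader, following the docstring above. The filename "reads.fasta" is hypothetical; plain, ".gz", and ".dexta" inputs are all handled by open_fasta_reader:

    from falcon_kit.FastaReader import open_fasta_reader

    # Stream FastaRecord objects; the underlying file is closed when the block exits.
    with open_fasta_reader("reads.fasta") as reader:
        for record in reader:
            print record.name, len(record.sequence)
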
diff --git a/FALCON/falcon_kit/__init__.py b/FALCON/falcon_kit/__init__.py
index 2e1685f..df872a8 100644
--- a/FALCON/falcon_kit/__init__.py
+++ b/FALCON/falcon_kit/__init__.py
@@ -1,39 +1 @@
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
 from .falcon_kit import *
diff --git a/FALCON/falcon_kit/bash.py b/FALCON/falcon_kit/bash.py
index e0880e8..df34cdb 100644
--- a/FALCON/falcon_kit/bash.py
+++ b/FALCON/falcon_kit/bash.py
@@ -194,6 +194,7 @@ def script_build_rdb(config, input_fofn_fn, run_jobs_bfn):
         mdust = ''
     params.update(locals())
     script = """\
+echo "PBFALCON_ERRFILE=$PBFALCON_ERRFILE"
 set -o pipefail
 #fc_fasta2fasta < {input_fofn_fn} >| fc.fofn
 while read fn; do  {cat_fasta} $fn | fasta2DB -v raw_reads -i${{fn##*/}}; done < {input_fofn_fn}
@@ -307,8 +308,9 @@ rmfollow() {
             if not line.startswith('LAmerge'):
                 continue
             las_files = [word + '.las' for word in functional.yield_args_from_line(line)]
-            assert las_fn == os.path.basename(las_files[0])
-            script.extend('rmfollow {}'.format(fn) for fn in las_files[1:])
+            #las_fn = os.path.basename(las_files[0])
+            #assert las_fn == os.path.basename(las_files[0])
+            script.extend('# rmfollow {}'.format(fn) for fn in las_files[1:])
             break
 
         content = bash_funcs + '\n'.join(script + [''])
@@ -327,13 +329,16 @@ def script_run_consensus(config, db_fn, las_fn, out_file_bfn):
     else:
         bash_cutoff = '{}'.format(length_cutoff)
     params.update(locals())
+    LA4Falcon_flags = 'P' if params.get('LA4Falcon_preload') else ''
     if config["falcon_sense_skip_contained"]:
-        run_consensus = """LA4Falcon -H$CUTOFF -fso {db_fn} {las_fn} | """
+        LA4Falcon_flags += 'fso'
     elif config["falcon_sense_greedy"]:
-        run_consensus = """LA4Falcon -H$CUTOFF -fog  {db_fn} {las_fn} | """
+        LA4Falcon_flags += 'fog'
     else:
-        run_consensus = """LA4Falcon -H$CUTOFF -fo  {db_fn} {las_fn} | """
-    run_consensus += """fc_consensus {falcon_sense_option} >| {out_file_bfn}"""
+        LA4Falcon_flags += 'fo'
+    if LA4Falcon_flags:
+        LA4Falcon_flags = '-' + ''.join(set(LA4Falcon_flags))
+    run_consensus = "LA4Falcon -H$CUTOFF %s {db_fn} {las_fn} | fc_consensus {falcon_sense_option} >| {out_file_bfn}"%LA4Falcon_flags
 
     if config.get('dazcon', False):
         run_consensus = """
@@ -378,6 +383,6 @@ def script_run_report_pre_assembly(i_raw_reads_db_fn, i_preads_fofn_fn, genome_l
     params = dict()
     params.update(locals())
     script = """\
-python -m falcon_kit.mains.report_pre_assembly --genome-length {genome_length} --length-cutoff {length_cutoff} --db {i_raw_reads_db_fn} --preads-fofn {i_preads_fofn_fn} --out {o_json_fn}
+python2.7 -m falcon_kit.mains.report_pre_assembly --genome-length {genome_length} --length-cutoff {length_cutoff} --db {i_raw_reads_db_fn} --preads-fofn {i_preads_fofn_fn} --out {o_json_fn}
 """
     return script.format(**params)
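The flag assembly above replaces three near-identical LA4Falcon command lines with one; a standalone sketch of the same logic (the config dict here is hypothetical, not the real run_support config object):

    # Hypothetical, self-contained version of the LA4Falcon flag assembly above.
    def la4falcon_flags(config):
        flags = 'P' if config.get('LA4Falcon_preload') else ''
        if config.get('falcon_sense_skip_contained'):
            flags += 'fso'
        elif config.get('falcon_sense_greedy'):
            flags += 'fog'
        else:
            flags += 'fo'
        # set() dedupes repeated letters; ordering does not matter for single-letter flags.
        return '-' + ''.join(set(flags))

    print la4falcon_flags({'LA4Falcon_preload': True})  # e.g. '-Pfo' (letter order may vary)
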
diff --git a/FALCON/falcon_kit/falcon_kit.py b/FALCON/falcon_kit/falcon_kit.py
index 0d01659..e87e2d6 100644
--- a/FALCON/falcon_kit/falcon_kit.py
+++ b/FALCON/falcon_kit/falcon_kit.py
@@ -1,40 +1,4 @@
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
+from __future__ import absolute_import
 __all__ = [
     'kup', 'DWA', 'falcon',
     'KmerLookup', 'KmerMatch', 'AlnRange', 'ConsensusData',
@@ -42,6 +6,7 @@ __all__ = [
     ]
 
 from ctypes import *
+import os
 import ext_falcon
 #module_path = os.path.split(__file__)[0]
 
@@ -70,8 +35,12 @@ class ConsensusData(Structure):
     _fields_ = [ ("sequence", c_char_p),
                  ("eff_cov", POINTER(c_uint)) ]
 
-
-falcon_dll = CDLL(ext_falcon.__file__)
+try:
+    falcon_dll = CDLL(ext_falcon.__file__)
+except OSError:
+    # It seems that setup.py has changed the __file__ it attaches to an extension module.
+    # I have no idea why or how, but this works around it.
+    falcon_dll = CDLL(os.path.join(os.path.dirname(__file__), '..', os.path.basename(ext_falcon.__file__)))
 
 kup = falcon_dll
 
diff --git a/FALCON/falcon_kit/fc_asm_graph.py b/FALCON/falcon_kit/fc_asm_graph.py
index 1f0e376..3cc7984 100644
--- a/FALCON/falcon_kit/fc_asm_graph.py
+++ b/FALCON/falcon_kit/fc_asm_graph.py
@@ -1,42 +1,6 @@
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
+from __future__ import absolute_import
+from .FastaReader import open_fasta_reader
 import networkx as nx
-from FastaReader import FastaReader
 
 RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
 
@@ -90,8 +54,8 @@ class AsmGraph(object):
 
         seqs = {}
         # load all p-read name into memory
-        f = FastaReader(fasta_fn)
-        for r in f:
+        with open_fasta_reader(fasta_fn) as f:
+          for r in f:
             if r.name not in all_read_ids:
                 continue
             seqs[r.name] = r.sequence.upper()
diff --git a/FALCON/falcon_kit/functional.py b/FALCON/falcon_kit/functional.py
index b9495d5..04ba8de 100644
--- a/FALCON/falcon_kit/functional.py
+++ b/FALCON/falcon_kit/functional.py
@@ -6,7 +6,7 @@ import re
 import StringIO
 
 def _verify_pairs(pairs1, pairs2):
-    if pairs1 != pairs2:
+    if pairs1 != pairs2: # pragma: no cover
         print('pair2dali:', pairs1)
         print('pair2sort:', pairs2)
         print('dali-sort:', set(pairs1) - set(pairs2))
@@ -34,8 +34,8 @@ def get_daligner_job_descriptions_sans_LAcheck(run_jobs_stream, db_prefix, singl
     result = {}
     for k,v in descs.iteritems():
         bash = skip_LAcheck(v)
-        bash = bash.replace('LAsort', 'python -m falcon_kit.mains.LAsort {}'.format(db_prefix))
-        bash = bash.replace('LAmerge', 'python -m falcon_kit.mains.LAmerge {}'.format(db_prefix))
+        bash = bash.replace('LAsort', 'python2.7 -m falcon_kit.mains.LAsort {}'.format(db_prefix))
+        bash = bash.replace('LAmerge', 'python2.7 -m falcon_kit.mains.LAmerge {}'.format(db_prefix))
         result[k] = bash
     return result
 
@@ -79,7 +79,7 @@ def get_daligner_job_descriptions(run_jobs_stream, db_prefix, single=False):
         Can return [('', '')] if only 1 block.
         """
         mo = re_pair_sort.search(line)
-        if not mo:
+        if not mo: # pragma: no cover
             raise Exception('Pattern {!r} does not match line {!r}'.format(
                 re_pair_sort.pattern, line))
         return mo.group(1, 2)
@@ -177,7 +177,7 @@ def get_las_filenames(mjob_data, db_prefix):
         mo = regex.search(bash_lines[i])
         if not mo:
             raise Exception('Regex {!r} failed on {!r}'.format(
-                re_las_name.pattern, bash_lines[i]))
+                regex.pattern, bash_lines[i]))
         las_fn = mo.group(1) + '.las'
         result[p_id] = las_fn
     return result
@@ -234,20 +234,26 @@ def get_script_xformer(pread_aln):
     else:
         return xform_script_for_raw_reads
 
+class GenomeCoverageError(Exception):
+    pass
+
 def calc_cutoff_from_reverse_sorted_readlength_counts(rl_counts, target):
     """Return first read_len which gives at least 'target' bases.
     """
     total = sum(pair[0]*pair[1] for pair in rl_counts)
     subtotal = 0
-    assert target <= total, 'Not enough genome coverage (target={} < actual={})'.format(target, total)
+    if target > total:
+        msg = 'Not enough reads available for desired genome coverage (bases needed={} > actual={})'.format(target, total)
+        raise GenomeCoverageError(msg)
     cutoff = 0
     for (rl, count) in rl_counts:
         subtotal += rl*count
         if subtotal >= target:
             cutoff = rl
             break
-    else:
-        raise Exception('Impossible target: target={target}, subtotal={subtotal}, total={total}'.format(locals()))
+    else: # pragma: no cover
+        msg = 'Impossible target (probably a bug): target={target}, subtotal={subtotal}, total={total}'.format(locals())
+        raise Exception(msg)
     return cutoff
 
 def num2int(num):
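A small worked example of the new cutoff behavior, using hypothetical read-length counts (longest first):

    from falcon_kit.functional import (
        calc_cutoff_from_reverse_sorted_readlength_counts, GenomeCoverageError)

    # 10 reads of 5000 bp, 20 of 2000 bp, 40 of 1000 bp: 130,000 bases total.
    rl_counts = [(5000, 10), (2000, 20), (1000, 40)]
    # Requesting 80,000 bases: 50,000 + 40,000 >= 80,000 at read length 2000.
    print calc_cutoff_from_reverse_sorted_readlength_counts(rl_counts, 80000)  # -> 2000
    # Requesting more than 130,000 bases now raises GenomeCoverageError
    # (with a readable message) instead of an AssertionError.
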
diff --git a/FALCON/falcon_kit/mains/actg_coordinate.py b/FALCON/falcon_kit/mains/actg_coordinate.py
index 52d34d1..b025d97 100644
--- a/FALCON/falcon_kit/mains/actg_coordinate.py
+++ b/FALCON/falcon_kit/mains/actg_coordinate.py
@@ -1,4 +1,4 @@
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
 
 
 def main(argv=None):
@@ -19,9 +19,9 @@ def main(argv=None):
             p_ctg_coor_map[ctg_id][w] = coor
 
 
-  a_ctg_fasta = FastaReader("a_ctg.fa")
-  for r in a_ctg_fasta:
-    rid = r.name.split()
-    rid, v, w = rid[:3]
-    pid = rid.split("-")[0]
-    print rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w]
+  with open_fasta_reader("a_ctg.fa") as a_ctg_fasta:
+    for r in a_ctg_fasta:
+      rid = r.name.split()
+      rid, v, w = rid[:3]
+      pid = rid.split("-")[0]
+      print rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w]
diff --git a/FALCON/falcon_kit/mains/calc_cutoff.py b/FALCON/falcon_kit/mains/calc_cutoff.py
index 4d8b974..f0492a0 100644
--- a/FALCON/falcon_kit/mains/calc_cutoff.py
+++ b/FALCON/falcon_kit/mains/calc_cutoff.py
@@ -1,6 +1,8 @@
 from .. import functional as f
 import argparse
+import os
 import sys
+import traceback
 
 def main(argv=sys.argv):
     import argparse
@@ -15,6 +17,9 @@ This is useful when length_cutoff is not provided but the genome-size
 can be estimated. The purpose is to *reduce* the amount of data seen by
 DALIGNER, since otherwise it will miss many alignments when it
 encounters resource limits.
+
+Note: If PBFALCON_ERRFILE is defined (and its directory is writable),
+we will write errors there in addition to stderr.
 """
     parser = argparse.ArgumentParser(description=description, epilog=epilog,
             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -22,14 +27,25 @@ encounters resource limits.
             help='Desired coverage ratio (i.e. over-sampling)')
     parser.add_argument('genome_size', type=int,
             help='Estimated number of bases in genome. (haploid?)')
-    parser.add_argument('capture', default='-',
+    parser.add_argument('capture', #default='-', # I guess default is not allowed for required args.
             help='File with captured output of DBstats. (Otherwise, stdin.)')
     args = parser.parse_args(argv[1:])
 
     target = int(args.genome_size * args.coverage)
     capture = open(args.capture) if args.capture!='-' else sys.stdin
     stats = capture.read()
-    cutoff = f.calc_cutoff(target, stats)
+    try:
+        cutoff = f.calc_cutoff(target, stats)
+    except Exception:
+        msg = traceback.format_exc()
+        msg += 'User-provided genome_size: {}\nDesired coverage: {}\n'.format(
+            args.genome_size, args.coverage)
+        # pbfalcon wants us to write errs here.
+        errfile = os.environ.get('PBFALCON_ERRFILE')
+        if errfile:
+            with open(errfile, 'w') as ofs:
+                ofs.write(msg)
+        raise Exception(msg)
     sys.stdout.write(str(cutoff))
 
 if __name__ == "__main__":
diff --git a/FALCON/falcon_kit/mains/dedup_a_tigs.py b/FALCON/falcon_kit/mains/dedup_a_tigs.py
index b8df550..0ed036a 100644
--- a/FALCON/falcon_kit/mains/dedup_a_tigs.py
+++ b/FALCON/falcon_kit/mains/dedup_a_tigs.py
@@ -1,4 +1,4 @@
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
 import argparse
 import sys
 
@@ -13,8 +13,8 @@ def parse_args(argv):
 
 def main(argv=sys.argv):
     args = parse_args(argv)
-    reads = FastaReader("a_ctg_all.fa")
-    with open("a_ctg.fa","w") as f:
+    with open_fasta_reader("a_ctg_all.fa") as reads:
+      with open("a_ctg.fa","w") as f:
         for r in reads:
             tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
             if 100*float(idt) > args.max_idt and 100*float(cov) > args.max_aln_cov and\
diff --git a/FALCON/falcon_kit/mains/fetch_reads.py b/FALCON/falcon_kit/mains/fetch_reads.py
index 9f7b458..237c6a0 100644
--- a/FALCON/falcon_kit/mains/fetch_reads.py
+++ b/FALCON/falcon_kit/mains/fetch_reads.py
@@ -1,5 +1,6 @@
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
 import argparse
+import contextlib
 import os
 import glob
 import sys
@@ -25,9 +26,9 @@ def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
         rid = int(fid.split('/')[1])/10
         return rid_to_oid[int(rid)]
 
-    ref_fasta = FastaReader(ctg_fa)
-    all_ctg_ids = set()
-    for s in ref_fasta:
+    with open_fasta_reader(ctg_fa) as ref_fasta:
+      all_ctg_ids = set()
+      for s in ref_fasta:
         s_id = s.name.split()[0]
         if ctg_id != 'all' and s_id != ctg_id:
             continue
@@ -81,11 +82,22 @@ def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
             print >>f, ctg_id
 
     read_out_files = {}
+    @contextlib.contextmanager
+    def reopened_fasta_out(ctg_id):
+                # A convenient closure, with a contextmanager.
+                if ctg_id not in read_out_files:
+                    read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'w' )
+                    read_out_files[ctg_id] = 1
+                else:
+                    read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'a' )
+                yield read_out
+                read_out.close()
+
     with open(read_fofn, 'r') as f:
         for r_fn in f:
             r_fn = r_fn.strip()
-            read_fa_file = FastaReader(r_fn)
-            for r in read_fa_file:
+            with open_fasta_reader(r_fn) as read_fa_file:  # will soon handle .dexta too
+              for r in read_fa_file:
                 rid = r.name.split()[0]
                 if rid not in read_set:
                     ctg_id = 'unassigned'
@@ -95,15 +107,9 @@ def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
                 if ctg_id == 'NA' or ctg_id not in all_ctg_ids:
                     ctg_id = 'unassigned'
 
-                if ctg_id not in read_out_files:
-                    read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'w' )
-                    read_out_files[ctg_id] = 1
-                else:
-                    read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'a' )
-
-                print >>read_out, '>'+rid
-                print >>read_out, r.sequence
-                read_out.close()
+                with reopened_fasta_out(ctg_id) as read_out:
+                    print >>read_out, '>'+rid
+                    print >>read_out, r.sequence
 
 def parse_args(argv):
     parser = argparse.ArgumentParser(description='using the read to contig mapping data to partition the reads grouped by contigs')
diff --git a/FALCON/falcon_kit/mains/get_read_ctg_map.py b/FALCON/falcon_kit/mains/get_read_ctg_map.py
index 0ef0825..e96528f 100644
--- a/FALCON/falcon_kit/mains/get_read_ctg_map.py
+++ b/FALCON/falcon_kit/mains/get_read_ctg_map.py
@@ -1,8 +1,9 @@
 from __future__ import absolute_import
+from .. import pype_tasks
+# pylint: disable=no-name-in-module, import-error, fixme, line-too-long
 from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
         makePypeLocalFile, fn, PypeTask)
 PypeThreadTaskBase = MyFakePypeThreadTaskBase
-from .. import pype_tasks
 import argparse
 import glob
 import logging
diff --git a/FALCON/falcon_kit/mains/graph_to_contig.py b/FALCON/falcon_kit/mains/graph_to_contig.py
index d004837..483709e 100644
--- a/FALCON/falcon_kit/mains/graph_to_contig.py
+++ b/FALCON/falcon_kit/mains/graph_to_contig.py
@@ -1,6 +1,6 @@
 import networkx as nx
 #from pbcore.io import FastaReader
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
 from falcon_kit import kup, falcon, DWA
 
 read_fasta = "preads4falcon.fasta"
@@ -76,8 +76,8 @@ def main(argv=None):
 
     seqs = {}
     # load all p-read name into memory
-    f = FastaReader(read_fasta)
-    for r in f:
+    with open_fasta_reader(read_fasta) as f:
+      for r in f:
         if r.name not in reads_in_layout:
             continue
         seqs[r.name] = r.sequence.upper()
diff --git a/FALCON/falcon_kit/mains/ovlp_filter.py b/FALCON/falcon_kit/mains/ovlp_filter.py
index 63d2de7..dd1a6ba 100644
--- a/FALCON/falcon_kit/mains/ovlp_filter.py
+++ b/FALCON/falcon_kit/mains/ovlp_filter.py
@@ -15,11 +15,9 @@ def filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len):
         def ignore(overlap_data):
             left_count = overlap_data["5p"]
             right_count = overlap_data["3p"]
-            if abs(left_count - right_count) > max_diff:
-                return True
-            elif left_count > max_ovlp or right_count > max_ovlp:
-                return True
-            elif left_count < min_ovlp or right_count < min_ovlp:
+            if (abs(left_count - right_count) > max_diff) or \
+               (left_count > max_ovlp) or (right_count > max_ovlp) or \
+               (left_count < min_ovlp) or (right_count < min_ovlp):
                 return True
 
         ignore_rtn = []
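The consolidated predicate above can be exercised in isolation; a minimal sketch, assuming hypothetical threshold values rather than FALCON's actual defaults:

    def ignore(overlap_data, max_diff=100, max_ovlp=300, min_ovlp=2):
        # An overlap count profile is ignored when the 5'/3' counts are too
        # unbalanced, too high, or too low.
        left = overlap_data["5p"]
        right = overlap_data["3p"]
        return (abs(left - right) > max_diff or
                left > max_ovlp or right > max_ovlp or
                left < min_ovlp or right < min_ovlp)

    assert ignore({"5p": 1, "3p": 50})        # 5' count below min_ovlp
    assert not ignore({"5p": 20, "3p": 25})   # balanced and within bounds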
diff --git a/FALCON/falcon_kit/mains/ovlp_to_graph.py b/FALCON/falcon_kit/mains/ovlp_to_graph.py
index 40dcd12..f8d6829 100644
--- a/FALCON/falcon_kit/mains/ovlp_to_graph.py
+++ b/FALCON/falcon_kit/mains/ovlp_to_graph.py
@@ -1,4 +1,3 @@
-#from pbcore.io import FastaReader
 import networkx as nx
 import os
 import shlex
@@ -1043,7 +1042,7 @@ def identify_simple_paths(sg2, edge_data):
         for v,w in free_edges:
             if (reverse_end(w), reverse_end(v) ) not in free_edges:
                 print "bug", v,w
-                print oreverse_end(w), reverse_end(v)
+                print reverse_end(w), reverse_end(v)
 
     while free_edges:
         if s_nodes:
diff --git a/FALCON/falcon_kit/mains/run1.py b/FALCON/falcon_kit/mains/run1.py
index 9b7bee6..247001a 100644
--- a/FALCON/falcon_kit/mains/run1.py
+++ b/FALCON/falcon_kit/mains/run1.py
@@ -1,6 +1,7 @@
 from .. import run_support as support
 from .. import bash, pype_tasks
-from ..util.system import only_these_symlinks
+from ..util.system import (only_these_symlinks, lfs_setstripe_maybe)
+# pylint: disable=no-name-in-module, import-error, fixme, line-too-long
 from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
         makePypeLocalFile, fn, PypeTask)
 import argparse
@@ -13,7 +14,7 @@ import sys
 import time
 
 
-fc_run_logger = logging.getLogger(__name__) # default, for remote tasks
+LOG = logging.getLogger(__name__) # default, for remote tasks
 
 
 def create_daligner_tasks(basedir, scatter_fn):
@@ -111,14 +112,14 @@ def create_consensus_gather_task(wd, inputs):
 
 
 def main1(prog_name, input_config_fn, logger_config_fn=None):
-    global fc_run_logger
-    fc_run_logger = support.setup_logger(logger_config_fn)
+    global LOG
+    LOG = support.setup_logger(logger_config_fn)
 
-    fc_run_logger.info('fc_run started with configuration %s', input_config_fn)
+    LOG.info('fc_run started with configuration %s', input_config_fn)
     try:
         config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
     except Exception:
-        fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
+        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
         raise
     input_fofn_plf = makePypeLocalFile(config['input_fofn'])
     genome_size = config.get('genome_size')
@@ -143,7 +144,7 @@ def run(wf, config,
         ):
     """
     Preconditions (for now):
-    * fc_run_logger
+    * LOG
     * run_support.logger
     """
     rawread_dir = os.path.abspath('./0-rawreads')
@@ -156,20 +157,21 @@ def run(wf, config,
         support.make_dirs(d)
 
     exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs
-    concurrent_jobs = config['pa_concurrent_jobs']
-    wf.max_jobs = concurrent_jobs
+    wf.max_jobs = config['default_concurrent_jobs']
 
-    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn'])))
-    make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
-                                  outputs = {'o_fofn': rawread_fofn_plf},
-                                  parameters = {},
-    )
-    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)
-
-    wf.addTasks([fofn_abs_task])
-    wf.refreshTargets([fofn_abs_task])
+    assert config['input_type'] in ('raw', 'preads'), 'Invalid input_type=={!r}'.format(config['input_type'])
 
     if config['input_type'] == 'raw':
+        rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn'])))
+        make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
+                                    outputs = {'o_fofn': rawread_fofn_plf},
+                                    parameters = {},
+        )
+        fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)
+
+        wf.addTasks([fofn_abs_task])
+        wf.refreshTargets([fofn_abs_task])
+
         #### import sequences into daligner DB
         sleep_done = makePypeLocalFile( os.path.join( rawread_dir, 'sleep_done') )
         rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, 'rdb_build_done') )
@@ -196,6 +198,7 @@ def run(wf, config,
 
         raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
         #### run daligner
+        wf.max_jobs = config['da_concurrent_jobs']
         scattered_plf = os.path.join(rawread_dir, 'daligner-scatter', 'scattered.json')
         make_daligner_scatter = PypeTask(
                 inputs = {
@@ -234,6 +237,7 @@ def run(wf, config,
         wf.refreshTargets(exitOnFailure=exitOnFailure)
 
         # Merge .las files.
+        wf.max_jobs = config['la_concurrent_jobs']
         scattered_plf = os.path.join(rawread_dir, 'merge-scatter', 'scattered.json')
         make_task = PypeTask(
                 inputs = {
@@ -262,8 +266,7 @@ def run(wf, config,
             sys.exit(0)
 
         # Produce new FOFN of preads fasta, based on consensus of overlaps.
-        concurrent_jobs = config['cns_concurrent_jobs']
-        wf.max_jobs = concurrent_jobs
+        wf.max_jobs = config['cns_concurrent_jobs']
 
         scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json')
         make_task = PypeTask(
@@ -308,13 +311,13 @@ def run(wf, config,
 
 
     if config['target'] == 'pre-assembly':
-        log.info('Quitting after stage-0 for "pre-assembly" target.')
+        LOG.info('Quitting after stage-0 for "pre-assembly" target.')
         sys.exit(0)
 
     # build pread database
     if config['input_type'] == 'preads':
         preads_fofn_plf = makePypeLocalFile(os.path.join(pread_dir, 'preads-fofn-abs', os.path.basename(config['input_fofn'])))
-        make_fofn_abs_task = PypeTask(inputs = {'i_fofn': rawread_fofn_plf},
+        make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
                                      outputs = {'o_fofn': preads_fofn_plf},
                                      parameters = {},
         )
@@ -345,9 +348,7 @@ def run(wf, config,
 
     preads_nblock = support.get_nblock(fn(preads_db))
     #### run daligner
-    concurrent_jobs = config['ovlp_concurrent_jobs']
-    wf.max_jobs = concurrent_jobs
-
+    wf.max_jobs = config['pda_concurrent_jobs']
     config['sge_option_da'] = config['sge_option_pda']
 
     scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json')
@@ -387,6 +388,7 @@ def run(wf, config,
     wf.refreshTargets(exitOnFailure=exitOnFailure)
 
     # Merge .las files.
+    wf.max_jobs = config['pla_concurrent_jobs']
     config['sge_option_la'] = config['sge_option_pla']
     scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
     make_task = PypeTask(
@@ -414,6 +416,8 @@ def run(wf, config,
     wf.refreshTargets(exitOnFailure=exitOnFailure)
 
 
+    # Draft assembly (called 'fc_' for now)
+    wf.max_jobs = config['fc_concurrent_jobs']
     db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
     db2falcon_done = makePypeLocalFile(os.path.join(db2falcon_dir, 'db2falcon_done'))
     preads4falcon_plf = makePypeLocalFile(os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
@@ -451,6 +455,7 @@ def run(wf, config,
 
 
 def main(argv=sys.argv):
+    lfs_setstripe_maybe(path='.', stripe=12)
     parser = argparse.ArgumentParser()
     parser.add_argument('config',
         help='.cfg/.ini/.json')
diff --git a/FALCON/falcon_kit/pype_tasks.py b/FALCON/falcon_kit/pype_tasks.py
index 4219388..bfc6f8b 100644
--- a/FALCON/falcon_kit/pype_tasks.py
+++ b/FALCON/falcon_kit/pype_tasks.py
@@ -201,7 +201,9 @@ def task_run_las_merge(self):
         else:
             src = os.path.relpath(las_path, cwd)
         tgt = os.path.join(cwd, os.path.basename(las_path))
-        LOG.debug('symlink {!r} -> {!r}'.format(src, tgt))
+        LOG.debug('symlink {!r} <- {!r}'.format(src, tgt))
+        if os.path.lexists(tgt):
+            os.unlink(tgt)
         os.symlink(src, tgt)
 
     config = self.parameters['config']
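A minimal sketch of the idempotent-symlink step added above, so that a re-run does not fail when the link already exists; the helper name is hypothetical.

    import os

    def force_symlink(src, tgt):
        # os.path.lexists() is also true for a dangling symlink, so stale
        # links from a previous run are removed before relinking.
        if os.path.lexists(tgt):
            os.unlink(tgt)
        os.symlink(src, tgt)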
diff --git a/FALCON/falcon_kit/run_support.py b/FALCON/falcon_kit/run_support.py
index a41f124..2e21fcd 100644
--- a/FALCON/falcon_kit/run_support.py
+++ b/FALCON/falcon_kit/run_support.py
@@ -68,7 +68,7 @@ def make_job_data(url, script_fn):
     Base job_name on script_fn.
     """
     wd = os.path.dirname(script_fn)
-    job_name = '{0}-{1}-{1}'.format(
+    job_name = '{0}-{1}-{2}'.format(
             os.path.basename(script_fn),
             url.split("/")[-1],
             str(uuid.uuid4())[:8],
@@ -131,6 +131,7 @@ def parse_config(config_fn):
         config.readfp(open(config_fn))
     return config
 
+import warnings
 def get_dict_from_old_falcon_cfg(config):
     job_type = "SGE"
     section = 'General'
@@ -143,7 +144,7 @@ def get_dict_from_old_falcon_cfg(config):
     else:
         sge_option = config.get(section, 'sge_option_da')
 
-    job_queue = "default"
+    job_queue = ""
     if config.has_option(section, 'job_queue'):
         job_queue = config.get(section, 'job_queue')
 
@@ -159,17 +160,33 @@ def get_dict_from_old_falcon_cfg(config):
     if config.has_option(section, 'pwatcher_directory'):
         pwatcher_directory = config.get(section, 'pwatcher_directory')
 
-    pa_concurrent_jobs = default_concurrent_jobs
-    if config.has_option(section, 'pa_concurrent_jobs'):
-        pa_concurrent_jobs = config.getint(section, 'pa_concurrent_jobs')
-
+    da_concurrent_jobs = default_concurrent_jobs
+    la_concurrent_jobs = default_concurrent_jobs
     cns_concurrent_jobs = default_concurrent_jobs
-    if config.has_option(section, 'cns_concurrent_jobs'):
-        cns_concurrent_jobs = config.getint(section, 'cns_concurrent_jobs')
+    pda_concurrent_jobs = default_concurrent_jobs
+    pla_concurrent_jobs = default_concurrent_jobs
+    fc_concurrent_jobs = default_concurrent_jobs
 
-    ovlp_concurrent_jobs = default_concurrent_jobs
+    if config.has_option(section, 'pa_concurrent_jobs'):
+        pa_concurrent_jobs = config.getint(section, 'pa_concurrent_jobs')
+        warnings.warn("Deprecated setting in config: 'pa_concurrent_jobs' -- Prefer da_concurrent_jobs and la_concurrent_jobs separately")
+        da_concurrent_jobs = la_concurrent_jobs = pa_concurrent_jobs
     if config.has_option(section, 'ovlp_concurrent_jobs'):
         ovlp_concurrent_jobs = config.getint(section, 'ovlp_concurrent_jobs')
+        warnings.warn("Deprecated setting in config: 'ovlp_concurrent_jobs' -- Prefer pda_concurrent_jobs and pla_concurrent_jobs separately")
+        pda_concurrent_jobs = pla_concurrent_jobs = ovlp_concurrent_jobs
+    if config.has_option(section, 'da_concurrent_jobs'):
+        da_concurrent_jobs = config.getint(section, 'da_concurrent_jobs')
+    if config.has_option(section, 'la_concurrent_jobs'):
+        la_concurrent_jobs = config.getint(section, 'la_concurrent_jobs')
+    if config.has_option(section, 'cns_concurrent_jobs'):
+        cns_concurrent_jobs = config.getint(section, 'cns_concurrent_jobs')
+    if config.has_option(section, 'pda_concurrent_jobs'):
+        pda_concurrent_jobs = config.getint(section, 'pda_concurrent_jobs')
+    if config.has_option(section, 'pla_concurrent_jobs'):
+        pla_concurrent_jobs = config.getint(section, 'pla_concurrent_jobs')
+    if config.has_option(section, 'fc_concurrent_jobs'):
+        fc_concurrent_jobs = config.getint(section, 'fc_concurrent_jobs')
 
     #appending = False
     #if config.has_option(section, 'appending'):
@@ -245,6 +262,10 @@ def get_dict_from_old_falcon_cfg(config):
     if config.has_option(section, 'falcon_sense_greedy'):
         falcon_sense_greedy = config.getboolean(section, 'falcon_sense_greedy')
 
+    LA4Falcon_preload = ""
+    if config.has_option(section, 'la4falcon_preload'):
+        LA4Falcon_preload = config.getboolean(section, 'la4falcon_preload')
+
     genome_size = 0
     if config.has_option(section, 'genome_size'):
         genome_size = config.getint(section, 'genome_size')
@@ -310,9 +331,13 @@ def get_dict_from_old_falcon_cfg(config):
                    "job_queue" : job_queue,
                    "input_type": input_type,
                    #"openending": openending,
-                   "pa_concurrent_jobs" : pa_concurrent_jobs,
-                   "ovlp_concurrent_jobs" : ovlp_concurrent_jobs,
+                   "default_concurrent_jobs" : default_concurrent_jobs,
+                   "da_concurrent_jobs" : da_concurrent_jobs,
+                   "la_concurrent_jobs" : la_concurrent_jobs,
                    "cns_concurrent_jobs" : cns_concurrent_jobs,
+                   "pda_concurrent_jobs" : pda_concurrent_jobs,
+                   "pla_concurrent_jobs" : pla_concurrent_jobs,
+                   "fc_concurrent_jobs" : fc_concurrent_jobs,
                    "overlap_filtering_setting": overlap_filtering_setting,
                    "genome_size" : genome_size,
                    "seed_coverage" : seed_coverage,
@@ -338,6 +363,7 @@ def get_dict_from_old_falcon_cfg(config):
                    "falcon_sense_option": falcon_sense_option,
                    "falcon_sense_skip_contained": falcon_sense_skip_contained,
                    "falcon_sense_greedy": falcon_sense_greedy,
+                   "LA4Falcon_preload": LA4Falcon_preload,
                    "stop_all_jobs_on_failure": stop_all_jobs_on_failure,
                    "use_tmpdir": use_tmpdir,
                    "pwatcher_type": pwatcher_type,
@@ -347,7 +373,6 @@ def get_dict_from_old_falcon_cfg(config):
     provided = dict(config.items(section))
     unused = set(provided) - set(k.lower() for k in hgap_config)
     if unused:
-        import warnings
         warnings.warn("Unexpected keys in input config: %s" %repr(unused))
 
     hgap_config["install_prefix"] = sys.prefix
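A minimal sketch of the per-stage concurrency fallback introduced above, with the deprecated pa_/ovlp_ aliases mapped onto the new keys; here cfg is assumed to be a plain dict of the [General] options rather than the real ConfigParser object.

    def resolve_concurrent_jobs(cfg, default):
        # Every stage starts at the default, deprecated aliases set their pair,
        # and explicit per-stage keys win.
        jobs = dict((stage, default) for stage in
                    ('da', 'la', 'cns', 'pda', 'pla', 'fc'))
        if 'pa_concurrent_jobs' in cfg:    # deprecated: split into da/la
            jobs['da'] = jobs['la'] = int(cfg['pa_concurrent_jobs'])
        if 'ovlp_concurrent_jobs' in cfg:  # deprecated: split into pda/pla
            jobs['pda'] = jobs['pla'] = int(cfg['ovlp_concurrent_jobs'])
        for stage in jobs:
            key = stage + '_concurrent_jobs'
            if key in cfg:
                jobs[stage] = int(cfg[key])
        return jobs

    assert resolve_concurrent_jobs({'pa_concurrent_jobs': '32'}, 8) == \
        {'da': 32, 'la': 32, 'cns': 8, 'pda': 8, 'pla': 8, 'fc': 8}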
diff --git a/FALCON/falcon_kit/stats_preassembly.py b/FALCON/falcon_kit/stats_preassembly.py
index f322b7f..623378a 100644
--- a/FALCON/falcon_kit/stats_preassembly.py
+++ b/FALCON/falcon_kit/stats_preassembly.py
@@ -7,7 +7,7 @@ See FALCON-pbsmrtpipe/pbfalcon/report_preassembly.py for XML version.
 #   http://swarm/files/depot/branches/springfield/S2.3/software/smrtanalysis/bioinformatics/tools/pbreports/pbreports/report/preassembly.py
 from __future__ import absolute_import
 from __future__ import division
-from .FastaReader import FastaReader
+from .FastaReader import open_fasta_reader
 from .util.io import syscall
 from . import functional
 import collections
@@ -21,7 +21,7 @@ import re
 log = logging.getLogger(__name__)
 __version__ = '0.1'
 
-Stats = collections.namedtuple('FastaStats', ['nreads', 'total', 'n50', 'p95'])
+Stats = collections.namedtuple('FastaStats', ['nreads', 'total', 'n50', 'p95', 'esize'])
 
 # Copied from pbreports/util.py
 # We want to avoid a dependency on pbreports b/c it needs matplotlib.
@@ -31,7 +31,7 @@ def get_fasta_readlengths(fasta_file):
     :return: (tuple)
     """
     lens = []
-    with FastaReader(fasta_file) as f:
+    with open_fasta_reader(fasta_file) as f:
         for record in f:
             lens.append(len(record.sequence))
     lens.sort()
@@ -88,11 +88,13 @@ def percentile(read_lens, p):
 def stats_from_sorted_readlengths(read_lens):
     nreads = len(read_lens)
     total = sum(read_lens)
+    sum_squares = sum(r*r for r in read_lens)
     n50 = read_len_above(read_lens, int(total * 0.50))
     p95 = percentile(read_lens, 0.95)
+    esize = sum_squares / total
     #alt_n50 = pbreports.util.compute_n50(read_lens)
     #log.info('our n50=%s, pbreports=%s' %(n50, alt_n50)) # Ours is more correct when median is between 2 reads.
-    return Stats(nreads=nreads, total=total, n50=n50, p95=p95)
+    return Stats(nreads=nreads, total=total, n50=n50, p95=p95, esize=esize)
 
 def read_lens_from_fofn(fofn_fn):
     """Return sorted list.
@@ -152,18 +154,21 @@ def stats_dict(stats_raw_reads, stats_seed_reads, stats_corrected_reads, genome_
     kwds['raw_n50'] = stats_raw_reads.n50
     kwds['raw_p95'] = stats_raw_reads.p95
     kwds['raw_coverage'] = stats_raw_reads.total / genome_length
+    kwds['raw_esize'] = stats_raw_reads.esize
     kwds['seed_reads'] = stats_seed_reads.nreads
     kwds['seed_bases'] = stats_seed_reads.total
     kwds['seed_mean'] = stats_seed_reads.total / stats_seed_reads.nreads
     kwds['seed_n50'] = stats_seed_reads.n50
     kwds['seed_p95'] = stats_seed_reads.p95
     kwds['seed_coverage'] = stats_seed_reads.total / genome_length
+    kwds['seed_esize'] = stats_seed_reads.esize
     kwds['preassembled_reads'] = stats_corrected_reads.nreads
     kwds['preassembled_bases'] = stats_corrected_reads.total
     kwds['preassembled_mean'] = stats_corrected_reads.total / stats_corrected_reads.nreads
     kwds['preassembled_n50'] = stats_corrected_reads.n50
     kwds['preassembled_p95'] = stats_corrected_reads.p95
     kwds['preassembled_coverage'] = stats_corrected_reads.total / genome_length
+    kwds['preassembled_esize'] = stats_corrected_reads.esize
     kwds['preassembled_yield'] = stats_corrected_reads.total / stats_seed_reads.total
     kwds['preassembled_seed_fragmentation'] = fragmentation
     kwds['preassembled_seed_truncation'] = truncation
@@ -178,6 +183,8 @@ def make_dict(
         i_raw_reads_fofn_fn,
         genome_length,
         length_cutoff,
+        fragmentation=-1,
+        truncation=-1,
     ):
     raw_reads = read_lens_from_fofn(i_raw_reads_fofn_fn)
     stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
@@ -193,6 +200,8 @@ def make_dict(
             stats_corrected_reads=stats_preads,
             genome_length=genome_length,
             length_cutoff=length_cutoff,
+            fragmentation=fragmentation,
+            truncation=truncation,
     )
     return report_dict
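A worked example of the new 'esize' statistic, i.e. sum(len^2) / sum(len), the expected length of the read covering a randomly chosen base:

    lens = [1000, 2000, 7000]
    total = sum(lens)                               # 10000 bases
    esize = float(sum(l * l for l in lens)) / total
    assert esize == 5400.0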
 
diff --git a/FALCON/falcon_kit/util/system.py b/FALCON/falcon_kit/util/system.py
index 5748a24..2f029d6 100644
--- a/FALCON/falcon_kit/util/system.py
+++ b/FALCON/falcon_kit/util/system.py
@@ -1,3 +1,4 @@
+from .io import system
 import logging
 import os
 import pprint
@@ -30,3 +31,11 @@ def only_these_symlinks(dir2paths):
         for base, rel in base2rel.iteritems():
             path = os.path.join(d, base)
             os.symlink(rel, path)
+
+def lfs_setstripe_maybe(path='.', stripe=12):
+    path = os.path.abspath(path)
+    rc = system('lfs setstripe -c {:d} {!s}'.format(stripe, path))
+    if rc:
+        log.info('Apparently {!r} is not on a Lustre filesystem.'.format(path))
+    else:
+        log.info('This lfs stripe ({}) should propagate to subdirs of {!r}.'.format(stripe, path))
diff --git a/FALCON/makefile b/FALCON/makefile
new file mode 100644
index 0000000..8b138c4
--- /dev/null
+++ b/FALCON/makefile
@@ -0,0 +1,65 @@
+# Feel free to override this.
+ifndef PYTHONUSERBASE
+  PYTHONUSERBASE:=LOCAL
+  PATH:=${PYTHONUSERBASE}/bin:${PATH}
+  export PYTHONUSERBASE
+  export PATH
+endif
+export COVERAGE_PROCESS_START
+
+MY_TEST_FLAGS?=-v -s
+
+install-edit:
+	pip -v install --user --edit .
+install: wheel
+	pip -v install --user --use-wheel --find-links=dist/ .
+pylint:
+	pylint --errors-only falcon_kit/
+test:
+	python -c 'import falcon_kit; print falcon_kit.falcon'
+	#pip install --user pytest
+	py.test ${MY_TEST_FLAGS} --junit-xml=test.basic.xml test/
+	py.test ${MY_TEST_FLAGS} --junit-xml=test.doctest.xml --doctest-modules falcon_kit/functional.py
+	cp -f test.basic.xml nose.basic.xml
+	cp -f test.doctest.xml nose.doctest.xml
+coverage:
+	make coverage-clean
+	#pip install --user coverage
+	COVERAGE_PROCESS_START=${PWD}/mycoverage.cfg ${MAKE} coverage-actual
+coverage-actual: test
+	ls -larth
+	coverage combine
+	ls -larth
+	coverage xml -o coverage.xml
+	sed -i -e 's@filename="@filename="./@g' coverage.xml
+	coverage report -m
+coverage-clean:
+	rm -f .coverage* coverage.xml
+coverage-install:
+	# This is needed only if you run from a different directory, since ./sitecustomize.py
+	# would not be in 'sys.path'.
+	# Assume PYTHONUSERBASE is set.
+	mkdir -p ${PYTHONUSERBASE}/lib/python2.7/site-packages
+	ln -f mysitecustomize.py ${PYTHONUSERBASE}/lib/python2.7/site-packages/sitecustomize.py
+coverage-uninstall:
+	rm -f ${PYTHONUSERBASE}/lib/python2.7/site-packages/sitecustomize.py*
+
+# We cannot run doctests on *all* modules because some include dependencies.
+# Just pypeFLOW for now, but I would rather not test dependencies anyway.
+
+wheel:
+	pip install --upgrade --user pip
+	python setup.py bdist_wheel
+# Look for dist/*.whl
+
+tar:
+	rm -f FALCON.tar.gz
+	tar cvzf FALCON.tar.gz -C ${PYTHONUSERBASE} .
+# Much smaller than the wheel, and includes all necessary dependencies,
+# but also includes anything already in the user-site.
+
+clean: coverage-clean
+	\rm -f *.xml
+
+
+.PHONY: install install-edit test wheel coverage tar clean
diff --git a/FALCON/mycoverage.cfg b/FALCON/mycoverage.cfg
new file mode 100644
index 0000000..9de3ac8
--- /dev/null
+++ b/FALCON/mycoverage.cfg
@@ -0,0 +1,5 @@
+[run]
+branch = True
+data_file = ${PWD}/.coverage
+parallel = True
+source = falcon_kit
diff --git a/FALCON/mysitecustomize.py b/FALCON/mysitecustomize.py
new file mode 100644
index 0000000..e060ab1
--- /dev/null
+++ b/FALCON/mysitecustomize.py
@@ -0,0 +1,3 @@
+#import site; site.addsitedir('...')
+#raise Exception('WHERE')
+import coverage; coverage.process_startup()
diff --git a/FALCON/setup.py b/FALCON/setup.py
index b6f893c..d189809 100755
--- a/FALCON/setup.py
+++ b/FALCON/setup.py
@@ -8,6 +8,7 @@ install_requires=[
         "networkx >=1.7, <=1.10",
         #"logging_tree",
         #"pbcore >= 0.6.3",
+        #"pypeFLOW", # We exclude pypeFLOW because it is not needed for the unit-tests.
         ]
 
 scripts = glob.glob("src/py_scripts/*.py")
diff --git a/FALCON/src/c/falcon.c b/FALCON/src/c/falcon.c
index 11740a1..ce679ac 100755
--- a/FALCON/src/c/falcon.c
+++ b/FALCON/src/c/falcon.c
@@ -147,6 +147,8 @@ align_tags_t * get_align_tags( char * aln_q_seq,
             p_j = j;
             p_jj = jj;
             p_q_base = aln_q_seq[k];
+        } else {
+            break; // when there is a big alignment gap > UINT8_MAX, stop extending the tagging string
         }
     }
     // sentinal at the end
@@ -316,13 +318,13 @@ consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
     consensus_data * consensus;
     //char * consensus;
     align_tag_t * c_tag;
-    static msa_pos_t * msa_array = NULL;
 
     coverage = calloc( t_len, sizeof(unsigned int) );
     local_nbase = calloc( t_len, sizeof(unsigned int) );
 
 #ifndef STATIC_ALLOCATE
 
+    msa_pos_t * msa_array = NULL; // For more efficiency, this should be injected.
     msa_array = calloc(t_len, sizeof(msa_pos_t *));
 
     for (i = 0; i < t_len; i++) {
@@ -331,10 +333,9 @@ consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
         allocate_delta_group(msa_array[i]);
     }
 
-#endif
-
-#ifdef STATIC_ALLOCATE
+#else
 
+    static msa_pos_t * msa_array = NULL;
     if ( msa_array == NULL) {
         msa_array = get_msa_working_sapce( 100000 );
     }
@@ -547,9 +548,7 @@ consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
     }
 
     free(msa_array);
-#endif
-
-#ifdef STATIC_ALLOCATE
+#else
     clean_msa_working_space(msa_array, t_len+1);
 #endif
 
diff --git a/FALCON/src/py_scripts_v0.1/falcon_asm.py b/FALCON/src/py_scripts_v0.1/falcon_asm.py
deleted file mode 100755
index 0b632e8..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_asm.py
+++ /dev/null
@@ -1,1154 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from pbcore.io import FastaReader
-import networkx as nx
-import os
-import shlex
-import sys
-import subprocess
-
-DEBUG_LOG_LEVEL = 0
-
-class SGNode(object):
-    """
-    class representing a node in the string graph
-    """
-    def __init__(self, node_name):
-        self.name = node_name
-        self.out_edges = []
-        self.in_edges = []
-    def add_out_edge(self, out_edge):
-        self.out_edges.append(out_edge)
-    def add_in_edge(self, in_edge):
-        self.in_edges.append(in_edge)
-
-class SGEdge(object):
-    """
-    class representing an edge in the string graph
-    """
-    def __init__(self, in_node, out_node):
-        self.in_node = in_node
-        self.out_node = out_node
-        self.attr = {}
-    def set_attribute(self, attr, value):
-        self.attr[attr] = value
-
-def reverse_end( node_id ):
-    node_id, end = node_id.split(":")
-    new_end = "B" if end == "E" else "E"
-    return node_id + ":" + new_end
-
-class StringGraph(object):
-    """
-    class representing the string graph
-    """
-    def __init__(self):
-        self.nodes = {}
-        self.edges = {}
-        self.n_mark = {}
-        self.e_reduce = {}
-        self.repeat_overlap = {}
-
-    def add_node(self, node_name):
-        """
-        add a node into the graph by given a node name
-        """
-        if node_name not in self.nodes:
-            self.nodes[node_name] = SGNode(node_name)
-
-    def add_edge(self, in_node_name, out_node_name, **attributes):
-        """
-        add an edge into the graph by given a pair of nodes
-        """
-        if (in_node_name, out_node_name) not in self.edges:
-
-            self.add_node(in_node_name)
-            self.add_node(out_node_name)
-            in_node = self.nodes[in_node_name]
-            out_node = self.nodes[out_node_name]
-
-            edge = SGEdge(in_node, out_node)
-            self.edges[ (in_node_name, out_node_name) ] = edge
-            in_node.add_out_edge(edge)
-            out_node.add_in_edge(edge)
-        edge =  self.edges[ (in_node_name, out_node_name) ]
-        for k, v in attributes.items():
-            edge.attr[k] = v
-
-    def init_reduce_dict(self):
-        for e in self.edges:
-            self.e_reduce[e] = False
-
-    def mark_chimer_edge(self):
-
-        for e_n, e in self.edges.items():
-            v = e_n[0]
-            w = e_n[1]
-            overlap_count = 0
-            for w_out_e in self.nodes[w].out_edges:
-                w_out_n = w_out_e.out_node.name
-                if (v, w_out_n) in self.edges:
-                    overlap_count += 1
-            for v_in_e in self.nodes[v].in_edges:
-                v_in_n = v_in_e.in_node.name
-                if (v_in_n, w) in self.edges:
-                    overlap_count += 1
-            if self.e_reduce[ (v, w) ] != True:
-                if overlap_count == 0:
-                    self.e_reduce[(v, w)] = True
-                    #print "XXX: chimer edge %s %s removed" % (v, w)
-                    v, w = reverse_end(w), reverse_end(v)
-                    self.e_reduce[(v, w)] = True
-                    #print "XXX: chimer edge %s %s removed" % (v, w)
-
-
-
-    def mark_spur_edge(self):
-
-        for  v in self.nodes:
-            if len(self.nodes[v].out_edges) > 1:
-                for out_edge in self.nodes[v].out_edges:
-                    w = out_edge.out_node.name
-
-                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
-                        #print "XXX: spur edge %s %s removed" % (v, w)
-                        self.e_reduce[(v, w)] = True
-                        v2, w2 = reverse_end(w), reverse_end(v)
-                        #print "XXX: spur edge %s %s removed" % (v2, w2)
-                        self.e_reduce[(v, w)] = True
-
-            if len(self.nodes[v].in_edges) > 1:
-                for in_edge in self.nodes[v].in_edges:
-                    w = in_edge.in_node.name
-                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
-                        #print "XXX: spur edge %s %s removed" % (w, v)
-                        self.e_reduce[(w, v)] = True
-                        v2, w2 = reverse_end(w), reverse_end(v)
-                        #print "XXX: spur edge %s %s removed" % (w2, v2)
-                        self.e_reduce[(w, v)] = True
-
-
-    def mark_tr_edges(self):
-        """
-        transitive reduction
-        """
-        n_mark = self.n_mark
-        e_reduce = self.e_reduce
-        FUZZ = 500
-        for n in self.nodes:
-            n_mark[n] = "vacant"
-
-        for n_name, node in self.nodes.items():
-
-            out_edges = node.out_edges
-            if len(out_edges) == 0:
-                continue
-
-            out_edges.sort(key=lambda x: x.attr["length"])
-
-            for e in out_edges:
-                w = e.out_node
-                n_mark[ w.name ] = "inplay"
-
-            max_len = out_edges[-1].attr["length"]
-
-            max_len += FUZZ
-
-            for e in out_edges:
-                e_len = e.attr["length"]
-                w = e.out_node
-                if n_mark[w.name] == "inplay":
-                    w.out_edges.sort( key=lambda x: x.attr["length"] )
-                    for e2 in w.out_edges:
-                        if e2.attr["length"] + e_len < max_len:
-                            x = e2.out_node
-                            if n_mark[x.name] == "inplay":
-                                n_mark[x.name] = "eliminated"
-
-            for e in out_edges:
-                e_len = e.attr["length"]
-                w = e.out_node
-                w.out_edges.sort( key=lambda x: x.attr["length"] )
-                if len(w.out_edges) > 0:
-                    x = w.out_edges[0].out_node
-                    if n_mark[x.name] == "inplay":
-                        n_mark[x.name] = "eliminated"
-                for e2 in w.out_edges:
-                    if e2.attr["length"] < FUZZ:
-                        x = e2.out_node
-                        if n_mark[x.name] == "inplay":
-                            n_mark[x.name] = "eliminated"
-
-            for out_edge in out_edges:
-                v = out_edge.in_node
-                w = out_edge.out_node
-                if n_mark[w.name] == "eliminated":
-                    e_reduce[ (v.name, w.name) ] = True
-                    #print "XXX: tr edge %s %s removed" % (v.name, w.name)
-                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
-                    e_reduce[(v_name, w_name)] = True
-                    #print "XXX: tr edge %s %s removed" % (v_name, w_name)
-                n_mark[w.name] = "vacant"
-
-
-    def mark_best_overlap(self):
-        """
-        find the best overlapped edges
-        """
-
-        best_edges = set()
-
-        for v in self.nodes:
-
-            out_edges = self.nodes[v].out_edges
-            if len(out_edges) > 0:
-                out_edges.sort(key=lambda e: e.attr["score"])
-                e = out_edges[-1]
-                best_edges.add( (e.in_node.name, e.out_node.name) )
-
-            in_edges = self.nodes[v].in_edges
-            if len(in_edges) > 0:
-                in_edges.sort(key=lambda e: e.attr["score"])
-                e = in_edges[-1]
-                best_edges.add( (e.in_node.name, e.out_node.name) )
-
-        if DEBUG_LOG_LEVEL > 1:
-            print "X", len(best_edges)
-
-        for e_n, e in self.edges.items():
-            v = e_n[0]
-            w = e_n[1]
-            if self.e_reduce[ (v, w) ] != True:
-                if (v, w) not in best_edges:
-                    self.e_reduce[(v, w)] = True
-                    #print "XXX: in best edge %s %s removed" % (v, w)
-                    v2, w2 = reverse_end(w), reverse_end(v)
-                    #print "XXX: in best edge %s %s removed" % (v2, w2)
-                    self.e_reduce[(v2, w2)] = True
-
-    def get_out_edges_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].out_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        return rtn
-
-
-    def get_in_edges_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].in_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        return rtn
-
-    def get_best_out_edge_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].out_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        rtn.sort(key=lambda e: e.attr["score"])
-
-        return rtn[-1]
-
-    def get_best_in_edge_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].in_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        rtn.sort(key=lambda e: e.attr["score"])
-        return rtn[-1]
-
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-def generate_seq_from_path(sg, seqs, path):
-    subseqs = []
-    r_id, end = path[0].split(":")
-
-    count = 0
-    for i in range( len( path ) -1 ):
-        w_n, v_n = path[i:i+2]
-        edge = sg.edges[ (w_n, v_n ) ]
-        read_id, coor = edge.attr["label"].split(":")
-        b,e = coor.split("-")
-        b = int(b)
-        e = int(e)
-        if b < e:
-            subseqs.append( seqs[read_id][b:e] )
-        else:
-            subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
-
-    return "".join(subseqs)
-
-
-def reverse_path( path ):
-    new_path = []
-    for n in list(path[::-1]):
-        rid, end = n.split(":")
-        new_end = "B" if end == "E" else "E"
-        new_path.append( rid+":"+new_end)
-    return new_path
-
-
-def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
-
-    """
-    given a string graph:sg and the sequences: seqs, write the unitig fasta file into out_fn
-    the funtion return a reduct graph representing the reduce string graph where the edges are unitigs
-
-    some extra files generated:
-        unit_edges.dat : an easy to parse file for unitig data
-        unit_edge_paths : the file contains the information of the path of all unitigs
-        uni_graph.gexf: the unitig graph in gexf format for visulization
-    """
-
-    G = SGToNXG(sg)
-    if connected_nodes != None:
-        connected_nodes = set(sg.nodes)
-    out_fasta = open(out_fn, "w")
-    nodes_for_tig = set()
-    sg_edges = set()
-    for v, w in sg.edges:
-        if sg.e_reduce[(v, w)] != True:
-            sg_edges.add( (v, w) )
-    count = 0
-    edges_in_tigs = set()
-
-    uni_edges = {}
-    path_f = open("unit_edge_paths","w")
-    uni_edge_f = open("unit_edges.dat", "w")
-    while len(sg_edges) > 0:
-        v, w = sg_edges.pop()
-
-        #nodes_for_tig.remove(n)
-        upstream_nodes = []
-
-        c_node = v
-        p_in_edges = sg.get_in_edges_for_node(c_node)
-        p_out_edges = sg.get_out_edges_for_node(c_node)
-        while len(p_in_edges) == 1 and len(p_out_edges) == 1:
-            p_node = p_in_edges[0].in_node
-            upstream_nodes.append(p_node.name)
-            if (p_node.name, c_node) not in  sg_edges:
-                break
-            p_in_edges = sg.get_in_edges_for_node(p_node.name)
-            p_out_edges = sg.get_out_edges_for_node(p_node.name)
-            c_node = p_node.name
-
-        upstream_nodes.reverse()
-
-        downstream_nodes = []
-        c_node = w
-        n_out_edges = sg.get_out_edges_for_node(c_node)
-        n_in_edges = sg.get_in_edges_for_node(c_node)
-        while len(n_out_edges) == 1 and len(n_in_edges) == 1:
-            n_node = n_out_edges[0].out_node
-            downstream_nodes.append(n_node.name)
-            if (c_node, n_node.name) not in  sg_edges:
-                break
-            n_out_edges = sg.get_out_edges_for_node(n_node.name)
-            n_in_edges = sg.get_in_edges_for_node(n_node.name)
-            c_node = n_node.name
-
-        whole_path = upstream_nodes + [v, w] + downstream_nodes
-        count += 1
-        subseq = generate_seq_from_path(sg, seqs, whole_path)
-        uni_edges.setdefault( (whole_path[0], whole_path[-1]), [] )
-        uni_edges[(whole_path[0], whole_path[-1])].append(  ( whole_path, subseq ) )
-        print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), subseq
-        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, whole_path[0], whole_path[-1], len(whole_path), " ".join(whole_path))
-        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, whole_path[0], whole_path[-1], len(whole_path))
-        print >>out_fasta, subseq
-        for i in range( len( whole_path ) -1 ):
-            w_n, v_n = whole_path[i:i+2]
-            try:
-                sg_edges.remove( (w_n, v_n) )
-            except KeyError: #if an edge is already deleted, ignore it
-                pass
-
-        r_whole_path = reverse_path( whole_path )
-        count += 1
-        subseq = generate_seq_from_path(sg, seqs, r_whole_path)
-        uni_edges.setdefault( (r_whole_path[0], r_whole_path[-1]), [] )
-        uni_edges[(r_whole_path[0], r_whole_path[-1])].append(  ( r_whole_path, subseq ) )
-        print >> uni_edge_f, r_whole_path[0], r_whole_path[-1], "-".join(r_whole_path), subseq
-        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path), " ".join(r_whole_path))
-        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path))
-        print >>out_fasta, subseq
-        for i in range( len( r_whole_path ) -1 ):
-            w_n, v_n = r_whole_path[i:i+2]
-            try:
-                sg_edges.remove( (w_n, v_n) )
-            except KeyError: #if an edge is already deleted, ignore it
-                pass
-
-
-    path_f.close()
-    uni_edge_f.close()
-    #uni_graph = nx.DiGraph()
-    #for n1, n2 in uni_edges.keys():
-    #    uni_graph.add_edge(n1, n2, count = len( uni_edges[ (n1,n2) ] ))
-    #nx.write_gexf(uni_graph, "uni_graph.gexf")
-
-    out_fasta.close()
-    return uni_edges
-
-def neighbor_bound(G, v, w, radius):
-    """
-    test if the node v and the node w are connected within a radius in graph G
-    """
-    g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
-    g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
-    if len(set(g1.edges()) & set(g2.edges())) > 0:
-        return True
-    else:
-        return False
-
-
-def is_branch_node(G, n):
-    """
-    test whether the node n is a "branch node" which the paths from any of two of
-    its offsprings do not intersect within a given radius
-    """
-    out_edges = G.out_edges([n])
-    n2 = [ e[1] for e in out_edges ]
-    is_branch = False
-    for i in range(len(n2)):
-        for j in range(i+1, len(n2)):
-            v = n2[i]
-            w = n2[j]
-            if neighbor_bound(G, v, w, 10) == False:
-                is_branch = True
-                break
-        if is_branch == True:
-            break
-    return is_branch
-
-
-def get_bundle( path, u_graph ):
-
-    """
-    find a sub-graph contain the nodes between the start and the end of the path
-    inputs:
-        u_graph : a unitig graph
-    returns:
-        bundle_graph: the whole bundle graph
-        bundle_paths: the paths in the bundle graph
-        sub_graph2_edges: all edges of the bundle graph
-
-    """
-
-    p_start, p_end = path[0], path[-1]
-    p_nodes = set(path)
-    p_edges = set(zip(path[:-1], path[1:]))
-
-    u_graph_r = u_graph.reverse()
-    down_path = nx.ego_graph(u_graph, p_start, radius=len(p_nodes), undirected=False)
-    up_path = nx.ego_graph(u_graph_r, p_end, radius=len(p_nodes), undirected=False)
-    subgraph_nodes = set(down_path) & set(up_path)
-
-
-    sub_graph = nx.DiGraph()
-    for v, w in u_graph.edges_iter():
-        if v in subgraph_nodes and w in subgraph_nodes:
-            if (v, w) in p_edges:
-                sub_graph.add_edge(v, w, color = "red")
-            else:
-                sub_graph.add_edge(v, w, color = "black")
-
-    sub_graph2 = nx.DiGraph()
-    tips = set()
-    tips.add(path[0])
-    sub_graph_r = sub_graph.reverse()
-    visited = set()
-    ct = 0
-    is_branch = is_branch_node(sub_graph, path[0]) #if the start node is a branch node
-    if is_branch:
-        n = tips.pop()
-        e = sub_graph.out_edges([n])[0] #pick one path the build the subgraph
-        sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
-        if e[1] not in visited:
-            last_node = e[1]
-            visited.add(e[1])
-            r_id, orientation = e[1].split(":")
-            orientation = "E" if orientation == "B" else "E"
-            visited.add( r_id +":" + orientation)
-            if not is_branch_node(sub_graph_r, e[1]):
-                tips.add(e[1])
-
-    while len(tips) != 0:
-        n = tips.pop()
-        out_edges = sub_graph.out_edges([n])
-        if len(out_edges) == 1:
-            e = out_edges[0]
-            sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
-            last_node = e[1]
-            if e[1] not in visited:
-                visited.add(e[1])
-                r_id, orientation = e[1].split(":")
-                orientation = "E" if orientation == "B" else "E"
-                visited.add( r_id +":" + orientation)
-                if not is_branch_node(sub_graph_r, e[1]):
-                    tips.add(e[1])
-        else:
-
-            is_branch = is_branch_node(sub_graph, n)
-            if not is_branch:
-                for e in out_edges:
-                    sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
-                    last_node = e[1]
-                    if e[1] not in visited:
-                        r_id, orientation = e[1].split(":")
-                        visited.add(e[1])
-                        orientation = "E" if orientation == "B" else "E"
-                        visited.add( r_id +":" + orientation)
-                        if not is_branch_node(sub_graph_r, e[1]):
-                            tips.add(e[1])
-        ct += 1
-    last_node = None
-    longest_len = 0
-
-    sub_graph2_nodes = sub_graph2.nodes()
-    sub_graph2_edges = sub_graph2.edges()
-
-
-    new_path = [path[0]]
-    for n in sub_graph2_nodes:
-        if len(sub_graph2.out_edges(n)) == 0 :
-            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
-            path_len = len(path_t)
-            if path_len > longest_len:
-                last_node = n
-                longest_len = path_len
-                new_path = path_t
-
-    if last_node == None:
-        for n in sub_graph2_nodes:
-            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
-            path_len = len(path_t)
-            if path_len > longest_len:
-                last_node = n
-                longest_len = path_len
-                new_path = path_t
-
-
-    path = new_path
-
-    # clean up sub_graph2 according to new begin and end
-    sub_graph2_r = sub_graph2.reverse()
-    down_path = nx.ego_graph(sub_graph2, path[0], radius=len(path), undirected=False)
-    up_path = nx.ego_graph(sub_graph2_r, path[-1], radius=len(path), undirected=False)
-    subgraph_nodes = set(down_path) & set(up_path)
-    for v in sub_graph2_nodes:
-        if v not in subgraph_nodes:
-            sub_graph2.remove_node(v)
-
-    if DEBUG_LOG_LEVEL > 1:
-        print "new_path", path[0], last_node, len(sub_graph2_nodes), path
-
-
-    bundle_paths = [path]
-    p_nodes = set(path)
-    p_edges = set(zip(path[:-1], path[1:]))
-
-    sub_graph2_nodes = sub_graph2.nodes()
-    sub_graph2_edges = sub_graph2.edges()
-
-    nodes_idx = dict( [ (n[1], n[0]) for n in enumerate(path) ]  )
-
-
-    # create a list of subpath that has no branch
-    non_branch_subpaths = []
-    wi = 0
-    vi = 0
-    v = path[0]
-    while v != path[-1] and wi < len(path)-1:
-        wi += 1
-        w = path[wi]
-        while len( sub_graph2.successors(w) ) == 1 and len( sub_graph2.predecessors(w) ) == 1 and wi < len(path)-1:
-            wi += 1
-            w = path[wi]
-        if  len( sub_graph2.successors(v) )!= 1 or len( sub_graph2.predecessors(w) )!= 1:
-            branched = True
-        else:
-            branched = False
-
-        if not branched:
-            non_branch_subpaths.append( path[vi:wi+1] )
-        v = w
-        vi = wi
-
-    # create the accompany_graph that has the path of the alternative subpaths
-
-    associate_graph = nx.DiGraph()
-    for v, w in sub_graph2.edges_iter():
-        if (v, w) not in p_edges:
-            associate_graph.add_edge(v, w, n_weight = sub_graph2[v][w]["n_weight"])
-
-    if DEBUG_LOG_LEVEL > 1:
-        print "associate_graph size:", len(associate_graph)
-        print "non_branch_subpaths",len(non_branch_subpaths), non_branch_subpaths
-
-    # construct the bundle graph
-    associate_graph_nodes = set(associate_graph.nodes())
-    bundle_graph = nx.DiGraph()
-    bundle_graph.add_path( path )
-    for i in range(len(non_branch_subpaths)-1):
-        if len(non_branch_subpaths[i]) == 0 or len( non_branch_subpaths[i+1] ) == 0:
-            continue
-        e1, e2 = non_branch_subpaths[i: i+2]
-        v = e1[-1]
-        w = e2[0]
-        if v == w:
-            continue
-        in_between_node_count = nodes_idx[w] - nodes_idx[v]
-        if v in associate_graph_nodes and w in associate_graph_nodes:
-            try:
-                a_path = nx.shortest_path(associate_graph, v, w, "n_weight")
-            except nx.NetworkXNoPath:
-                continue
-            bundle_graph.add_path( a_path )
-            bundle_paths.append( a_path )
-
-    return bundle_graph, bundle_paths, sub_graph2_edges
-
-def get_bundles(u_edges):
-
-    """
-    input: all unitig edges
-    output: the assembled primary_tigs.fa and all_tigs.fa
-    """
-
-    ASM_graph = nx.DiGraph()
-    out_f = open("primary_tigs.fa", "w")
-    main_tig_paths = open("primary_tigs_paths","w")
-    sv_tigs = open("all_tigs.fa","w")
-    sv_tig_paths = open("all_tigs_paths","w")
-    max_weight = 0
-    for v, w in u_edges:
-        x = max( [len(s[1]) for s in u_edges[ (v,w) ] ] )
-        if DEBUG_LOG_LEVEL > 1:
-            print "W", v, w, x
-        if x > max_weight:
-            max_weight = x
-
-    in_edges = {}
-    out_edges = {}
-    for v, w in u_edges:
-        in_edges.setdefault(w, [])
-        out_edges.setdefault(w, [])
-        in_edges[w].append( (v, w) )
-
-        out_edges.setdefault(v, [])
-        in_edges.setdefault(v, [])
-        out_edges[v].append( (v, w) )
-
-    u_graph = nx.DiGraph()
-    for v,w in u_edges:
-
-        u_graph.add_edge(v, w, n_weight = max_weight - max( [len(s[1]) for s in  u_edges[ (v,w) ] ] ) )
-
-    bundle_edge_out = open("bundle_edges","w")
-    bundle_index = 0
-    G = u_graph.copy()
-    visited_u_edges = set()
-    while len(G) > 0:
-
-        root_nodes = set()
-        for n in G:
-            if G.in_degree(n) == 0:
-                root_nodes.add(n)
-
-        if len(root_nodes) == 0:
-            if G.in_degree(n) != 1:
-                root_nodes.add(n)
-
-        if len(root_nodes) == 0:
-            root_nodes.add( G.nodes()[0] )
-
-        candidates = []
-
-        for n in list(root_nodes):
-            sp =nx.single_source_shortest_path_length(G, n)
-            sp = sp.items()
-            sp.sort(key=lambda x : x[1])
-            longest = sp[-1]
-            if DEBUG_LOG_LEVEL > 2:
-                print "L", n, longest[0]
-            if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop
-                continue
-            candidates.append ( (longest[1], n, longest[0]) )
-
-        if len(candidates) == 0:
-            print "no more candiate", len(G.edges()), len(G.nodes())
-            if len(G.edges()) > 0:
-                path = G.edges()[0]
-                print path
-            else:
-                break
-        else:
-            candidates.sort()
-
-            candidate = candidates[-1]
-
-            if candidate[1] == candidate[2]:
-                G.remove_node(candidate[1])
-                continue
-
-            path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight")
-
-        if DEBUG_LOG_LEVEL > 1:
-            print "X", path[0], path[-1], len(path)
-
-        cmp_edges = set()
-        g_edges = set(G.edges())
-        new_path = []
-        tail = True
-        # avioid confusion due to long palindrome sequence
-        if len(path) > 2:
-            for i in range( 0, len( path ) - 1 ):
-                v_n, w_n = path[i:i+2]
-                new_path.append(v_n)
-                # the comment out code below might be useful for filter out some high connectivity nodes
-                #if (v_n, w_n) in cmp_edges or\
-                #    len(u_graph.out_edges(w_n)) > 5 or\
-                #    len(u_graph.in_edges(w_n)) > 5:
-                if (v_n, w_n) in cmp_edges:
-                    tail = False
-                    break
-
-                r_id, end = v_n.split(":")
-                end = "E" if end == "B" else "B"
-                v_n2 = r_id + ":" + end
-
-                r_id, end = w_n.split(":")
-                end = "E" if end == "B" else "B"
-                w_n2 = r_id + ":" + end
-
-                if (w_n2, v_n2) in g_edges:
-                    cmp_edges.add( (w_n2, v_n2) )
-
-            if tail:
-                new_path.append(w_n)
-        else:
-            new_path = path[:]
-
-
-        if len(new_path) > 1:
-            path = new_path
-
-            if DEBUG_LOG_LEVEL > 2:
-                print "Y", path[0], path[-1], len(path)
-
-            bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G )
-            for bg_edge in bundle_graph_edges:
-                print >> bundle_edge_out, bundle_index, "edge", bg_edge[0], bg_edge[1]
-            for path_ in bundle_paths:
-                print >>bundle_edge_out, "path", bundle_index, " ".join(path_)
-
-            edges_to_be_removed = set()
-            if DEBUG_LOG_LEVEL > 2:
-                print "Z", bundle_paths[0][0], bundle_paths[0][-1]
-                print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
-
-            if len(bundle_graph_edges) > 0:
-
-                ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
-                extra_u_edges = []
-
-                print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
-                subseqs = []
-
-                for i in range(len(bundle_paths[0]) - 1):
-                    v, w = bundle_paths[0][i:i+2]
-                    edges_to_be_removed.add( (v,w) )
-                    uedges = u_edges[ (v,w) ]
-                    uedges.sort( key= lambda x: len(x[0]) )
-                    subseqs.append( uedges[-1][1] )
-                    visited_u_edges.add( "-".join(uedges[-1][0]) )
-                    for ue in uedges:
-                        if "-".join(ue[0]) not in visited_u_edges:
-                            visited_u_edges.add("-".join(ue[0]))
-                            extra_u_edges.append(ue)
-                seq = "".join(subseqs)
-                sv_tig_idx = 0
-                print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(bundle_paths[0]) )
-                if len(seq) > 0:
-                    print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
-                    print >> out_f, seq
-                    print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, bundle_paths[0][0], bundle_paths[0][-1])
-                    print >> sv_tigs, "".join(subseqs)
-
-                sv_tig_idx += 1
-
-                for sv_path in bundle_paths[1:]:
-                    print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
-                    ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
-                    subseqs = []
-                    for i in range(len(sv_path) - 1):
-                        v, w = sv_path[i:i+2]
-                        edges_to_be_removed.add( (v,w) )
-                        uedges = u_edges[ (v,w) ]
-                        uedges.sort( key= lambda x: len(x[0]) )
-                        subseqs.append( uedges[-1][1] )
-                        visited_u_edges.add( "-".join(uedges[-1][0]) )
-                        for ue in uedges:
-                            if "-".join(ue[0]) not in visited_u_edges:
-                                visited_u_edges.add("-".join(ue[0]))
-                                extra_u_edges.append(ue)
-                    seq = "".join(subseqs)
-                    if len(seq) > 0:
-                        print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
-                        print >> sv_tigs, "".join(subseqs)
-                    sv_tig_idx += 1
-                for u_path, seq in extra_u_edges:
-                    #u_path = u_path.split("-")
-                    ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
-                    print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
-                    print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
-                    print >> sv_tigs, seq
-                    sv_tig_idx += 1
-
-
-                bundle_index += 1
-        else:
-            #TODO, consolidate code here
-            v,w = path
-            uedges = u_edges[ (v,w) ]
-            uedges.sort( key= lambda x: len(x[0]) )
-            subseqs.append( uedges[-1][1] )
-            seq = "".join(subseqs)
-            print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(paths) )
-            print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, path[0], path[-1])
-            print >> sv_tigs, seq
-            sv_tig_idx += 1
-            bundle_index += 1
-            bundle_graph_edges = zip(path[:-1],path[1:])
-
-        #clean up the graph
-
-        edges = set(G.edges())
-        edges_to_be_removed |= set(bundle_graph_edges)
-
-        if DEBUG_LOG_LEVEL > 2:
-            print "BGE",bundle_graph_edges
-
-        edge_remove_count = 0
-        for v, w in edges_to_be_removed:
-            if (v, w) in edges:
-                G.remove_edge( v, w )
-                edge_remove_count += 1
-                if DEBUG_LOG_LEVEL > 2:
-                    print "remove edge", bundle_index, w, v
-
-        edges = set(G.edges())
-        for v, w in edges_to_be_removed:
-
-            r_id, end = v.split(":")
-            end = "E" if end == "B" else "B"
-            v = r_id + ":" + end
-
-            r_id, end = w.split(":")
-            end = "E" if end == "B" else "B"
-            w = r_id + ":" + end
-
-            if (w, v) in edges:
-                G.remove_edge( w, v )
-                edge_remove_count += 1
-                if DEBUG_LOG_LEVEL > 2:
-                    print "remove edge", bundle_index, w, v
-
-        if edge_remove_count == 0:
-            break
-
-        nodes = G.nodes()
-        for n in nodes:
-            if G.in_degree(n) == 0 and G.out_degree(n) == 0:
-                G.remove_node(n)
-                if DEBUG_LOG_LEVEL > 2:
-                    print "remove node", n
-
-    sv_tig_paths.close()
-    sv_tigs.close()
-    main_tig_paths.close()
-    out_f.close()
-    bundle_edge_out.close()
-    return ASM_graph
-
-
-
-def SGToNXG(sg):
-    G=nx.DiGraph()
-
-    max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
-    out_f = open("edges_list","w")
-    for v, w in sg.edges:
-        if sg.e_reduce[(v, w)] != True:
-        ##if 1:
-            out_degree = len(sg.nodes[v].out_edges)
-            G.add_node( v, size = out_degree )
-            G.add_node( w, size = out_degree )
-            label = sg.edges[ (v, w) ].attr["label"]
-            score = sg.edges[ (v, w) ].attr["score"]
-            print >>out_f, v, w, label, score
-            G.add_edge( v, w, label = label, weight = 0.001*score, n_weight = max_score - score )
-            #print in_node_name, out_node_name
-    out_f.close()
-    return G
-
-if __name__ == "__main__":
-
-    import argparse
-
-    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
-    parser.add_argument('overlap_file', help='a file that contains the overlap information.')
-    parser.add_argument('read_fasta', help='the file that contains the sequence to be assembled')
-    parser.add_argument('--min_len', type=int, default=4000,
-                        help='minimum length of the reads to be considered for assembling')
-    parser.add_argument('--min_idt', type=float, default=96,
-                        help='minimum alignment identity of the reads to be considered for assembling')
-    parser.add_argument('--disable_chimer_prediction', action="store_true", default=False,
-                        help='you may want to disable this, as some reads can be falsely identified as chimers in low-coverage cases')
-
-    args = parser.parse_args()
-
-
-    overlap_file = args.overlap_file
-    read_fasta = args.read_fasta
-
-    seqs = {}
-    # load all p-reads into memory
-    f = FastaReader(read_fasta)
-    for r in f:
-        seqs[r.name] = r.sequence.upper()
-
-    G=nx.Graph()
-    edges =set()
-    overlap_data = []
-    contained_reads = set()
-    overlap_count = {}
-
-
-    # loop through the overlap data and load it into a Python array
-    # contained reads are identified
-
-    with open(overlap_file) as f:
-        for l in f:
-            l = l.strip().split()
-
-            # work around some ill-formed data records
-            if len(l) != 13:
-                continue
-
-            f_id, g_id, score, identity = l[:4]
-            if f_id == g_id:  # don't need self-self overlapping
-                continue
-
-            if g_id not in seqs:
-                continue
-
-            if f_id not in seqs:
-                continue
-
-            score = int(score)
-            identity = float(identity)
-            contained = l[12]
-            if contained == "contained":
-                contained_reads.add(f_id)
-                continue
-            if contained == "contains":
-                contained_reads.add(g_id)
-                continue
-            if contained == "none":
-                continue
-
-            if identity < args.min_idt: # only take records with identity above the threshold as overlapped reads
-                continue
-            #if score > -2000:
-            #    continue
-            f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
-            g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
-
-            # only use reads longer than min_len (4 kb by default) for assembly
-            if f_len < args.min_len: continue
-            if g_len < args.min_len: continue
-
-            # double check for proper overlap
-            if f_start > 24 and f_len - f_end > 24:  # allow a 24-base tolerance on both sides of the overlap
-                continue
-
-            if g_start > 24 and g_len - g_end > 24:
-                continue
-
-            if g_strain == 0:
-                if f_start < 24 and g_len - g_end > 24:
-                    continue
-                if g_start < 24 and f_len - f_end > 24:
-                    continue
-            else:
-                if f_start < 24 and g_start > 24:
-                    continue
-                if g_start < 24 and f_start > 24:
-                    continue
-
-            overlap_data.append( (f_id, g_id, score, identity,
-                                  f_strain, f_start, f_end, f_len,
-                                  g_strain, g_start, g_end, g_len) )
-
-            overlap_count[f_id] = overlap_count.get(f_id,0)+1
-            overlap_count[g_id] = overlap_count.get(g_id,0)+1
-
-    overlap_set = set()
-    sg = StringGraph()
-    for od in overlap_data:
-        f_id, g_id, score, identity = od[:4]
-        if f_id in contained_reads:
-            continue
-        if g_id in contained_reads:
-            continue
-        f_s, f_b, f_e, f_l = od[4:8]
-        g_s, g_b, g_e, g_l = od[8:12]
-        overlap_pair = [f_id, g_id]
-        overlap_pair.sort()
-        overlap_pair = tuple( overlap_pair )
-        if overlap_pair in overlap_set:  # don't allow duplicated records
-            continue
-        else:
-            overlap_set.add(overlap_pair)
-
-
-        if g_s == 1: # reversed alignment, swap the begin and end coordinates
-            g_b, g_e = g_e, g_b
-
-        # build the string graph edges for each overlap
-        if f_b > 24:
-            if g_b < g_e:
-                """
-                     f.B         f.E
-                  f  ----------->
-                  g         ------------->
-                            g.B           g.E
-                """
-                if f_b == 0 or g_e - g_l == 0:
-                    continue
-                sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
-                                                           length = abs(f_b-0),
-                                                           score = -score)
-                sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_e, g_l),
-                                                           length = abs(g_e-g_l),
-                                                           score = -score)
-            else:
-                """
-                     f.B         f.E
-                  f  ----------->
-                  g         <-------------
-                            g.E           g.B
-                """
-                if f_b == 0 or g_e == 0:
-                    continue
-                sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
-                                                           length = abs(f_b -0),
-                                                           score = -score)
-                sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_e, 0),
-                                                           length = abs(g_e- 0),
-                                                           score = -score)
-        else:
-            if g_b < g_e:
-                """
-                                    f.B         f.E
-                  f                 ----------->
-                  g         ------------->
-                            g.B           g.E
-                """
-                if g_b == 0 or f_e - f_l == 0:
-                    continue
-                sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_b, 0),
-                                                           length = abs(g_b - 0),
-                                                           score = -score)
-                sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
-                                                           length = abs(f_e-f_l),
-                                                           score = -score)
-            else:
-                """
-                                    f.B         f.E
-                  f                 ----------->
-                  g         <-------------
-                            g.E           g.B
-                """
-                if g_b - g_l == 0 or f_e - f_l ==0:
-                    continue
-                sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_b, g_l),
-                                                           length = abs(g_b - g_l),
-                                                           score = -score)
-                sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
-                                                           length = abs(f_e - f_l),
-                                                           score = -score)
-
-
-    sg.init_reduce_dict()
-    if not args.disable_chimer_prediction:
-        sg.mark_chimer_edge()
-    sg.mark_spur_edge()
-    sg.mark_tr_edges() # mark edges that are transitively redundant
-
-    if DEBUG_LOG_LEVEL > 1:
-        print sum( [1 for c in sg.e_reduce.values() if c == True] )
-        print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-    sg.mark_best_overlap() # mark those edges that are best overlap edges
-
-    if DEBUG_LOG_LEVEL > 1:
-        print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-
-    G = SGToNXG(sg)
-    #nx.write_gexf(G, "string_graph.gexf") # output the raw string graph for visualization
-    nx.write_adjlist(G, "string_graph.adj") # write out the whole adjacency list of the string graph
-
-    u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa") # reduce the string graph into a unitig graph
-    ASM_graph = get_bundles(u_edges )  # get the assembly
-    #nx.write_gexf(ASM_graph, "asm_graph.gexf")
diff --git a/FALCON/src/py_scripts_v0.1/falcon_asm_s.py b/FALCON/src/py_scripts_v0.1/falcon_asm_s.py
deleted file mode 100755
index 5041881..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_asm_s.py
+++ /dev/null
@@ -1,1220 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from pbcore.io import FastaReader
-import networkx as nx
-import os
-import shlex
-import sys
-import subprocess
-
-DEBUG_LOG_LEVEL = 0
-
-class SGNode(object):
-    """
-    class representing a node in the string graph
-    """
-    def __init__(self, node_name):
-        self.name = node_name
-        self.out_edges = []
-        self.in_edges = []
-    def add_out_edge(self, out_edge):
-        self.out_edges.append(out_edge)
-    def add_in_edge(self, in_edge):
-        self.in_edges.append(in_edge)
-
-class SGEdge(object):
-    """
-    class representing an edge in the string graph
-    """
-    def __init__(self, in_node, out_node):
-        self.in_node = in_node
-        self.out_node = out_node
-        self.attr = {}
-    def set_attribute(self, attr, value):
-        self.attr[attr] = value
-
-def reverse_end( node_id ):
-    node_id, end = node_id.split(":")
-    new_end = "B" if end == "E" else "E"
-    return node_id + ":" + new_end
-
-class StringGraph(object):
-    """
-    class representing the string graph
-    """
-    def __init__(self):
-        self.nodes = {}
-        self.edges = {}
-        self.n_mark = {}
-        self.e_reduce = {}
-        self.repeat_overlap = {}
-
-    def add_node(self, node_name):
-        """
-        add a node to the graph given a node name
-        """
-        if node_name not in self.nodes:
-            self.nodes[node_name] = SGNode(node_name)
-
-    def add_edge(self, in_node_name, out_node_name, **attributes):
-        """
-        add an edge to the graph given a pair of nodes
-        """
-        if (in_node_name, out_node_name) not in self.edges:
-
-            self.add_node(in_node_name)
-            self.add_node(out_node_name)
-            in_node = self.nodes[in_node_name]
-            out_node = self.nodes[out_node_name]
-
-            edge = SGEdge(in_node, out_node)
-            self.edges[ (in_node_name, out_node_name) ] = edge
-            in_node.add_out_edge(edge)
-            out_node.add_in_edge(edge)
-        edge =  self.edges[ (in_node_name, out_node_name) ]
-        for k, v in attributes.items():
-            edge.attr[k] = v
-
-    def init_reduce_dict(self):
-        for e in self.edges:
-            self.e_reduce[e] = False
-
-    def mark_chimer_edge(self):
-
-        for e_n, e in self.edges.items():
-            v = e_n[0]
-            w = e_n[1]
-            overlap_count = 0
-            for w_out_e in self.nodes[w].out_edges:
-                w_out_n = w_out_e.out_node.name
-                if (v, w_out_n) in self.edges:
-                    overlap_count += 1
-            for v_in_e in self.nodes[v].in_edges:
-                v_in_n = v_in_e.in_node.name
-                if (v_in_n, w) in self.edges:
-                    overlap_count += 1
-            if self.e_reduce[ (v, w) ] != True:
-                if overlap_count == 0:
-                    self.e_reduce[(v, w)] = True
-                    #print "XXX: chimer edge %s %s removed" % (v, w)
-                    v, w = reverse_end(w), reverse_end(v)
-                    self.e_reduce[(v, w)] = True
-                    #print "XXX: chimer edge %s %s removed" % (v, w)
-
-
-
-    def mark_spur_edge(self):
-
-        for  v in self.nodes:
-            if len(self.nodes[v].out_edges) > 1:
-                for out_edge in self.nodes[v].out_edges:
-                    w = out_edge.out_node.name
-
-                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
-                        #print "XXX: spur edge %s %s removed" % (v, w)
-                        self.e_reduce[(v, w)] = True
-                        v2, w2 = reverse_end(w), reverse_end(v)
-                        #print "XXX: spur edge %s %s removed" % (v2, w2)
-                        self.e_reduce[(v, w)] = True
-
-            if len(self.nodes[v].in_edges) > 1:
-                for in_edge in self.nodes[v].in_edges:
-                    w = in_edge.in_node.name
-                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
-                        #print "XXX: spur edge %s %s removed" % (w, v)
-                        self.e_reduce[(w, v)] = True
-                        v2, w2 = reverse_end(w), reverse_end(v)
-                        #print "XXX: spur edge %s %s removed" % (w2, v2)
-                        self.e_reduce[(w, v)] = True
-
-
-    def mark_tr_edges(self):
-        """
-        transitive reduction
-        """
-        n_mark = self.n_mark
-        e_reduce = self.e_reduce
-        FUZZ = 500
-        for n in self.nodes:
-            n_mark[n] = "vacant"
-
-        for n_name, node in self.nodes.items():
-
-            out_edges = node.out_edges
-            if len(out_edges) == 0:
-                continue
-
-            out_edges.sort(key=lambda x: x.attr["length"])
-
-            for e in out_edges:
-                w = e.out_node
-                n_mark[ w.name ] = "inplay"
-
-            max_len = out_edges[-1].attr["length"]
-
-            max_len += FUZZ
-
-            for e in out_edges:
-                e_len = e.attr["length"]
-                w = e.out_node
-                if n_mark[w.name] == "inplay":
-                    w.out_edges.sort( key=lambda x: x.attr["length"] )
-                    for e2 in w.out_edges:
-                        if e2.attr["length"] + e_len < max_len:
-                            x = e2.out_node
-                            if n_mark[x.name] == "inplay":
-                                n_mark[x.name] = "eliminated"
-
-            for e in out_edges:
-                e_len = e.attr["length"]
-                w = e.out_node
-                w.out_edges.sort( key=lambda x: x.attr["length"] )
-                if len(w.out_edges) > 0:
-                    x = w.out_edges[0].out_node
-                    if n_mark[x.name] == "inplay":
-                        n_mark[x.name] = "eliminated"
-                for e2 in w.out_edges:
-                    if e2.attr["length"] < FUZZ:
-                        x = e2.out_node
-                        if n_mark[x.name] == "inplay":
-                            n_mark[x.name] = "eliminated"
-
-            for out_edge in out_edges:
-                v = out_edge.in_node
-                w = out_edge.out_node
-                if n_mark[w.name] == "eliminated":
-                    e_reduce[ (v.name, w.name) ] = True
-                    #print "XXX: tr edge %s %s removed" % (v.name, w.name)
-                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
-                    e_reduce[(v_name, w_name)] = True
-                    #print "XXX: tr edge %s %s removed" % (v_name, w_name)
-                n_mark[w.name] = "vacant"
-
-
-    def mark_best_overlap(self):
-        """
-        find the best overlapped edges
-        """
-
-        best_edges = set()
-
-        for v in self.nodes:
-
-            out_edges = self.nodes[v].out_edges
-            if len(out_edges) > 0:
-                out_edges.sort(key=lambda e: e.attr["score"])
-                e = out_edges[-1]
-                best_edges.add( (e.in_node.name, e.out_node.name) )
-
-            in_edges = self.nodes[v].in_edges
-            if len(in_edges) > 0:
-                in_edges.sort(key=lambda e: e.attr["score"])
-                e = in_edges[-1]
-                best_edges.add( (e.in_node.name, e.out_node.name) )
-
-        if DEBUG_LOG_LEVEL > 1:
-            print "X", len(best_edges)
-
-        for e_n, e in self.edges.items():
-            v = e_n[0]
-            w = e_n[1]
-            if self.e_reduce[ (v, w) ] != True:
-                if (v, w) not in best_edges:
-                    self.e_reduce[(v, w)] = True
-                    #print "XXX: in best edge %s %s removed" % (v, w)
-                    v2, w2 = reverse_end(w), reverse_end(v)
-                    #print "XXX: in best edge %s %s removed" % (v2, w2)
-                    self.e_reduce[(v2, w2)] = True
-
-    def get_out_edges_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].out_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        return rtn
-
-
-    def get_in_edges_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].in_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        return rtn
-
-    def get_best_out_edge_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].out_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        rtn.sort(key=lambda e: e.attr["score"])
-
-        return rtn[-1]
-
-    def get_best_in_edge_for_node(self, name, mask=True):
-        rtn = []
-        for e in self.nodes[name].in_edges:
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[ (v.name, w.name) ] == False:
-                rtn.append(e)
-        rtn.sort(key=lambda e: e.attr["score"])
-        return rtn[-1]
-
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-def generate_seq_from_path(sg, seqs, path):
-    subseqs = []
-    r_id, end = path[0].split(":")
-
-    count = 0
-    for i in range( len( path ) -1 ):
-        w_n, v_n = path[i:i+2]
-        edge = sg.edges[ (w_n, v_n ) ]
-        read_id, coor = edge.attr["label"].split(":")
-        b,e = coor.split("-")
-        b = int(b)
-        e = int(e)
-        if b < e:
-            subseqs.append( seqs[read_id][b:e] )
-        else:
-            subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
-
-    return "".join(subseqs)
-
-
-def reverse_path( path ):
-    new_path = []
-    for n in list(path[::-1]):
-        rid, end = n.split(":")
-        new_end = "B" if end == "E" else "E"
-        new_path.append( rid+":"+new_end)
-    return new_path
-
-
-def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
-
-    """
-    given a string graph (sg) and the sequences (seqs), write the unitig fasta file into out_fn
-    the function returns a reduced graph representing the reduced string graph where the edges are unitigs
-
-    some extra files generated:
-        unit_edges.dat : an easy-to-parse file for unitig data
-        unit_edge_paths : the file containing the paths of all unitigs
-        uni_graph.gexf : the unitig graph in gexf format for visualization
-    """
-
-    G = SGToNXG(sg)
-    if connected_nodes != None:
-        connected_nodes = set(sg.nodes)
-    out_fasta = open(out_fn, "w")
-    nodes_for_tig = set()
-    sg_edges = set()
-    for v, w in sg.edges:
-        if sg.e_reduce[(v, w)] != True:
-            sg_edges.add( (v, w) )
-    count = 0
-    edges_in_tigs = set()
-
-    uni_edges = {}
-    path_f = open("unit_edge_paths","w")
-    uni_edge_f = open("unit_edges.dat", "w")
-    while len(sg_edges) > 0:
-        v, w = sg_edges.pop()
-
-        #nodes_for_tig.remove(n)
-        upstream_nodes = []
-
-        c_node = v
-        p_in_edges = sg.get_in_edges_for_node(c_node)
-        p_out_edges = sg.get_out_edges_for_node(c_node)
-        while len(p_in_edges) == 1 and len(p_out_edges) == 1:
-            p_node = p_in_edges[0].in_node
-            upstream_nodes.append(p_node.name)
-            if (p_node.name, c_node) not in  sg_edges:
-                break
-            p_in_edges = sg.get_in_edges_for_node(p_node.name)
-            p_out_edges = sg.get_out_edges_for_node(p_node.name)
-            c_node = p_node.name
-
-        upstream_nodes.reverse()
-
-        downstream_nodes = []
-        c_node = w
-        n_out_edges = sg.get_out_edges_for_node(c_node)
-        n_in_edges = sg.get_in_edges_for_node(c_node)
-        while len(n_out_edges) == 1 and len(n_in_edges) == 1:
-            n_node = n_out_edges[0].out_node
-            downstream_nodes.append(n_node.name)
-            if (c_node, n_node.name) not in  sg_edges:
-                break
-            n_out_edges = sg.get_out_edges_for_node(n_node.name)
-            n_in_edges = sg.get_in_edges_for_node(n_node.name)
-            c_node = n_node.name
-
-        whole_path = upstream_nodes + [v, w] + downstream_nodes
-        count += 1
-        subseq = generate_seq_from_path(sg, seqs, whole_path)
-        #subseq = ""
-        uni_edges.setdefault( (whole_path[0], whole_path[-1]), [] )
-        uni_edges[(whole_path[0], whole_path[-1])].append(  ( whole_path, subseq ) )
-        print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), subseq
-        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, whole_path[0], whole_path[-1], len(whole_path), " ".join(whole_path))
-        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, whole_path[0], whole_path[-1], len(whole_path))
-        print >>out_fasta, subseq
-        for i in range( len( whole_path ) -1 ):
-            w_n, v_n = whole_path[i:i+2]
-            try:
-                sg_edges.remove( (w_n, v_n) )
-            except KeyError: #if an edge is already deleted, ignore it
-                pass
-
-        r_whole_path = reverse_path( whole_path )
-        count += 1
-        subseq = generate_seq_from_path(sg, seqs, r_whole_path)
-        #subseq = ""
-        uni_edges.setdefault( (r_whole_path[0], r_whole_path[-1]), [] )
-        uni_edges[(r_whole_path[0], r_whole_path[-1])].append(  ( r_whole_path, subseq ) )
-        print >> uni_edge_f, r_whole_path[0], r_whole_path[-1], "-".join(r_whole_path), subseq
-        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path), " ".join(r_whole_path))
-        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path))
-        print >>out_fasta, subseq
-        for i in range( len( r_whole_path ) -1 ):
-            w_n, v_n = r_whole_path[i:i+2]
-            try:
-                sg_edges.remove( (w_n, v_n) )
-            except KeyError: #if an edge is already deleted, ignore it
-                pass
-
-
-    path_f.close()
-    uni_edge_f.close()
-    #uni_graph = nx.DiGraph()
-    #for n1, n2 in uni_edges.keys():
-    #    uni_graph.add_edge(n1, n2, count = len( uni_edges[ (n1,n2) ] ))
-    #nx.write_gexf(uni_graph, "uni_graph.gexf")
-
-    out_fasta.close()
-    return uni_edges
-
-def neighbor_bound(G, v, w, radius):
-    """
-    test if the node v and the node w are connected within a radius in graph G
-    """
-    g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
-    g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
-    if len(set(g1.edges()) & set(g2.edges())) > 0:
-        return True
-    else:
-        return False
-
-
-def is_branch_node(G, n):
-    """
-    test whether the node n is a "branch node", i.e. whether the paths from any two of
-    its offspring do not intersect within a given radius
-    """
-    out_edges = G.out_edges([n])
-    n2 = [ e[1] for e in out_edges ]
-    is_branch = False
-    for i in range(len(n2)):
-        for j in range(i+1, len(n2)):
-            v = n2[i]
-            w = n2[j]
-            if neighbor_bound(G, v, w, 10) == False:
-                is_branch = True
-                break
-        if is_branch == True:
-            break
-    return is_branch
-
-
-def get_bundle( path, u_graph, u_graph_r ):
-
-    """
-    find a sub-graph containing the nodes between the start and the end of the path
-    inputs:
-        u_graph : a unitig graph
-    returns:
-        bundle_graph: the whole bundle graph
-        bundle_paths: the paths in the bundle graph
-        sub_graph2_edges: all edges of the bundle graph
-
-    """
-
-    p_start, p_end = path[0], path[-1]
-    p_nodes = set(path)
-    p_edges = set(zip(path[:-1], path[1:]))
-
-    down_path = nx.ego_graph(u_graph, p_start, radius=len(p_nodes), undirected=False)
-    up_path = nx.ego_graph(u_graph_r, p_end, radius=len(p_nodes), undirected=False)
-    subgraph_nodes = set(down_path) & set(up_path)
-
-
-    sub_graph = nx.DiGraph()
-    for v, w in u_graph.edges_iter():
-        if v in subgraph_nodes and w in subgraph_nodes:
-            if (v, w) in p_edges:
-                sub_graph.add_edge(v, w, color = "red")
-            else:
-                sub_graph.add_edge(v, w, color = "black")
-
-    sub_graph2 = nx.DiGraph()
-    tips = set()
-    tips.add(path[0])
-    sub_graph_r = sub_graph.reverse()
-    visited = set()
-    ct = 0
-    is_branch = is_branch_node(sub_graph, path[0]) #if the start node is a branch node
-    if is_branch:
-        n = tips.pop()
-        e = sub_graph.out_edges([n])[0] # pick one path to build the subgraph
-        sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
-        if e[1] not in visited:
-            last_node = e[1]
-            visited.add(e[1])
-            r_id, orientation = e[1].split(":")
-            orientation = "E" if orientation == "B" else "E"
-            visited.add( r_id +":" + orientation)
-            if not is_branch_node(sub_graph_r, e[1]):
-                tips.add(e[1])
-
-    while len(tips) != 0:
-        n = tips.pop()
-        out_edges = sub_graph.out_edges([n])
-        if len(out_edges) == 1:
-            e = out_edges[0]
-            sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
-            last_node = e[1]
-            if e[1] not in visited:
-                visited.add(e[1])
-                r_id, orientation = e[1].split(":")
-                orientation = "E" if orientation == "B" else "E"
-                visited.add( r_id +":" + orientation)
-                if not is_branch_node(sub_graph_r, e[1]):
-                    tips.add(e[1])
-        else:
-
-            is_branch = is_branch_node(sub_graph, n)
-            if not is_branch:
-                for e in out_edges:
-                    sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
-                    last_node = e[1]
-                    if e[1] not in visited:
-                        r_id, orientation = e[1].split(":")
-                        visited.add(e[1])
-                        orientation = "E" if orientation == "B" else "E"
-                        visited.add( r_id +":" + orientation)
-                        if not is_branch_node(sub_graph_r, e[1]):
-                            tips.add(e[1])
-        ct += 1
-    last_node = None
-    longest_len = 0
-
-    sub_graph2_nodes = sub_graph2.nodes()
-    sub_graph2_edges = sub_graph2.edges()
-
-
-    new_path = [path[0]]
-    for n in sub_graph2_nodes:
-        if len(sub_graph2.out_edges(n)) == 0 :
-            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
-            path_len = len(path_t)
-            if path_len > longest_len:
-                last_node = n
-                longest_len = path_len
-                new_path = path_t
-
-    if last_node == None:
-        for n in sub_graph2_nodes:
-            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
-            path_len = len(path_t)
-            if path_len > longest_len:
-                last_node = n
-                longest_len = path_len
-                new_path = path_t
-
-
-    path = new_path
-
-    # clean up sub_graph2 according to new begin and end
-    sub_graph2_r = sub_graph2.reverse()
-    down_path = nx.ego_graph(sub_graph2, path[0], radius=len(path), undirected=False)
-    up_path = nx.ego_graph(sub_graph2_r, path[-1], radius=len(path), undirected=False)
-    subgraph_nodes = set(down_path) & set(up_path)
-    for v in sub_graph2_nodes:
-        if v not in subgraph_nodes:
-            sub_graph2.remove_node(v)
-
-    if DEBUG_LOG_LEVEL > 1:
-        print "new_path", path[0], last_node, len(sub_graph2_nodes), path
-
-
-    bundle_paths = [path]
-    p_nodes = set(path)
-    p_edges = set(zip(path[:-1], path[1:]))
-
-    sub_graph2_nodes = sub_graph2.nodes()
-    sub_graph2_edges = sub_graph2.edges()
-
-    nodes_idx = dict( [ (n[1], n[0]) for n in enumerate(path) ]  )
-
-
-    # create a list of subpaths that have no branches
-    non_branch_subpaths = []
-    wi = 0
-    vi = 0
-    v = path[0]
-    while v != path[-1] and wi < len(path)-1:
-        wi += 1
-        w = path[wi]
-        while len( sub_graph2.successors(w) ) == 1 and len( sub_graph2.predecessors(w) ) == 1 and wi < len(path)-1:
-            wi += 1
-            w = path[wi]
-        if  len( sub_graph2.successors(v) )!= 1 or len( sub_graph2.predecessors(w) )!= 1:
-            branched = True
-        else:
-            branched = False
-
-        if not branched:
-            non_branch_subpaths.append( path[vi:wi+1] )
-        v = w
-        vi = wi
-
-    # create the associate_graph that holds the paths of the alternative subpaths
-
-    associate_graph = nx.DiGraph()
-    for v, w in sub_graph2.edges_iter():
-        if (v, w) not in p_edges:
-            associate_graph.add_edge(v, w, n_weight = sub_graph2[v][w]["n_weight"])
-
-    if DEBUG_LOG_LEVEL > 1:
-        print "associate_graph size:", len(associate_graph)
-        print "non_branch_subpaths",len(non_branch_subpaths), non_branch_subpaths
-
-    # construct the bundle graph
-    associate_graph_nodes = set(associate_graph.nodes())
-    bundle_graph = nx.DiGraph()
-    bundle_graph.add_path( path )
-    for i in range(len(non_branch_subpaths)-1):
-        if len(non_branch_subpaths[i]) == 0 or len( non_branch_subpaths[i+1] ) == 0:
-            continue
-        e1, e2 = non_branch_subpaths[i: i+2]
-        v = e1[-1]
-        w = e2[0]
-        if v == w:
-            continue
-        in_between_node_count = nodes_idx[w] - nodes_idx[v]
-        if v in associate_graph_nodes and w in associate_graph_nodes:
-            try:
-                a_path = nx.shortest_path(associate_graph, v, w, "n_weight")
-            except nx.NetworkXNoPath:
-                continue
-            bundle_graph.add_path( a_path )
-            bundle_paths.append( a_path )
-
-    return bundle_graph, bundle_paths, sub_graph2_edges
-
-def get_bundles(u_edges):
-
-    """
-    input: all unitig edges
-    output: the assembled primary_tigs.fa and all_tigs.fa
-    """
-
-    ASM_graph = nx.DiGraph()
-    out_f = open("primary_tigs.fa", "w")
-    main_tig_paths = open("primary_tigs_paths","w")
-    sv_tigs = open("all_tigs.fa","w")
-    sv_tig_paths = open("all_tigs_paths","w")
-    max_weight = 0
-    for v, w in u_edges:
-        x = max( [len(s[1]) for s in u_edges[ (v,w) ] ] )
-        if DEBUG_LOG_LEVEL > 1:
-            print "W", v, w, x
-        if x > max_weight:
-            max_weight = x
-
-    in_edges = {}
-    out_edges = {}
-    for v, w in u_edges:
-        in_edges.setdefault(w, [])
-        out_edges.setdefault(w, [])
-        in_edges[w].append( (v, w) )
-
-        out_edges.setdefault(v, [])
-        in_edges.setdefault(v, [])
-        out_edges[v].append( (v, w) )
-
-    u_graph = nx.DiGraph()
-    for v,w in u_edges:
-
-        u_graph.add_edge(v, w, n_weight = max_weight - max( [len(s[1]) for s in  u_edges[ (v,w) ] ] ) )
-
-    bundle_edge_out = open("bundle_edges","w")
-    bundle_index = 0
-
-
-    components = nx.weakly_connected_component_subgraphs(u_graph)
-    components = [ (len(c), c) for c in components ]
-    components.sort()
-    #components.reverse()
-    allS = len(u_graph)
-    ssG = 0.0
-    processed_overlaps = set()
-    for sG, G in components:
-
-        ssG += sG
-        print "process graph of size ", sG, "%0.2f %0.2f" % (ssG, ssG/allS)
-        G_edges = set(G.edges())
-
-        dual_component = False
-
-        for v, w in list(G_edges):
-            v = v.split(":")[0]
-            w = w.split(":")[0]
-            if (v, w) in processed_overlaps:
-                dual_component = True
-                break
-
-        if dual_component == True:
-            continue
-
-        for v, w in list(G_edges):
-            v = v.split(":")[0]
-            w = w.split(":")[0]
-            processed_overlaps.add( (v,w) )
-            processed_overlaps.add( (w,v) )
-
-        G_r = G.reverse()
-        visited_u_edges = set()
-
-        while len(G) > 0:
-            out_f.flush()
-            main_tig_paths.flush()
-            sv_tigs.flush()
-            sv_tig_paths.flush()
-
-
-            #root_nodes = set()
-            candidates = []
-            for n in G:
-                sp =nx.single_source_shortest_path_length(G, n)
-                sp = sp.items()
-                sp.sort(key=lambda x : x[1])
-                longest = sp[-1]
-                if DEBUG_LOG_LEVEL > 2:
-                    print "L", n, longest[0]
-                if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop
-                    continue
-                candidates.append ( (longest[1], n, longest[0]) )
-
-                n = longest[0]
-                sp =nx.single_source_shortest_path_length(G_r, n)
-                sp = sp.items()
-                sp.sort(key=lambda x : x[1])
-                longest = sp[-1]
-                if DEBUG_LOG_LEVEL > 2:
-                    print "L", n, longest[0]
-                if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop
-                    continue
-                candidates.append ( (longest[1], longest[0], n) )
-                if len(candidates) != 0:
-                    break
-
-            if len(candidates) == 0:
-                print "no more candiate", len(G.edges()), len(G.nodes())
-                if len(G_edges) > 0:
-                    path = G_edges.pop()
-                    G_edges.add(path)
-                    print path
-                else:
-                    break
-            else:
-                candidates.sort()
-
-                candidate = candidates[-1]
-
-                if candidate[1] == candidate[2]:
-                    G.remove_node(candidate[1])
-                    G_r.remove_node(candidate[1])
-                    continue
-
-                path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight")
-
-            if DEBUG_LOG_LEVEL > 1:
-                print "X", path[0], path[-1], len(path)
-
-            cmp_edges = set()
-            #g_edges = set(G.edges())
-            new_path = []
-            tail = True
-            # avoid confusion due to long palindromic sequences
-            if len(path) > 2:
-                for i in range( 0, len( path ) - 1 ):
-                    v_n, w_n = path[i:i+2]
-                    new_path.append(v_n)
-                    # the commented-out code below might be useful for filtering out some high-connectivity nodes
-                    #if (v_n, w_n) in cmp_edges or\
-                    #    len(u_graph.out_edges(w_n)) > 5 or\
-                    #    len(u_graph.in_edges(w_n)) > 5:
-                    if (v_n, w_n) in cmp_edges:
-                        tail = False
-                        break
-
-                    r_id, end = v_n.split(":")
-                    end = "E" if end == "B" else "B"
-                    v_n2 = r_id + ":" + end
-
-                    r_id, end = w_n.split(":")
-                    end = "E" if end == "B" else "B"
-                    w_n2 = r_id + ":" + end
-
-                    if (w_n2, v_n2) in G_edges:
-                        cmp_edges.add( (w_n2, v_n2) )
-
-                if tail:
-                    new_path.append(w_n)
-            else:
-                new_path = path[:]
-
-
-            if len(new_path) > 1:
-                path = new_path
-
-                if DEBUG_LOG_LEVEL > 2:
-                    print "Y", path[0], path[-1], len(path)
-
-                bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G, G_r )
-                for bg_edge in bundle_graph_edges:
-                    print >> bundle_edge_out, bundle_index, "edge", bg_edge[0], bg_edge[1]
-                for path_ in bundle_paths:
-                    print >>bundle_edge_out, "path", bundle_index, " ".join(path_)
-
-                edges_to_be_removed = set()
-                if DEBUG_LOG_LEVEL > 2:
-                    print "Z", bundle_paths[0][0], bundle_paths[0][-1]
-                    print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
-
-                if len(bundle_graph_edges) > 0:
-
-                    ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
-                    extra_u_edges = []
-
-                    print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
-                    subseqs = []
-
-                    for i in range(len(bundle_paths[0]) - 1):
-                        v, w = bundle_paths[0][i:i+2]
-                        edges_to_be_removed.add( (v,w) )
-                        uedges = u_edges[ (v,w) ]
-                        uedges.sort( key= lambda x: len(x[0]) )
-                        subseqs.append( uedges[-1][1] )
-                        visited_u_edges.add( "-".join(uedges[-1][0]) )
-                        for ue in uedges:
-                            if "-".join(ue[0]) not in visited_u_edges:
-                                visited_u_edges.add("-".join(ue[0]))
-                                extra_u_edges.append(ue)
-                    seq = "".join(subseqs)
-                    sv_tig_idx = 0
-                    print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(bundle_paths[0]) )
-                    if len(seq) > 0:
-                        print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
-                        print >> out_f, seq
-                        print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, bundle_paths[0][0], bundle_paths[0][-1])
-                        print >> sv_tigs, "".join(subseqs)
-
-                    sv_tig_idx += 1
-
-                    for sv_path in bundle_paths[1:]:
-                        print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
-                        ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
-                        subseqs = []
-                        for i in range(len(sv_path) - 1):
-                            v, w = sv_path[i:i+2]
-                            edges_to_be_removed.add( (v,w) )
-                            uedges = u_edges[ (v,w) ]
-                            uedges.sort( key= lambda x: len(x[0]) )
-                            subseqs.append( uedges[-1][1] )
-                            visited_u_edges.add( "-".join(uedges[-1][0]) )
-                            for ue in uedges:
-                                if "-".join(ue[0]) not in visited_u_edges:
-                                    visited_u_edges.add("-".join(ue[0]))
-                                    extra_u_edges.append(ue)
-                        seq = "".join(subseqs)
-                        if len(seq) > 0:
-                            print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
-                            print >> sv_tigs, "".join(subseqs)
-                        sv_tig_idx += 1
-                    for u_path, seq in extra_u_edges:
-                        #u_path = u_path.split("-")
-                        ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
-                        print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
-                        print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
-                        print >> sv_tigs, seq
-                        sv_tig_idx += 1
-
-
-                    bundle_index += 1
-            else:
-                #TODO, consolidate code here
-                v,w = path
-                uedges = u_edges[ (v,w) ]
-                uedges.sort( key= lambda x: len(x[0]) )
-                subseqs.append( uedges[-1][1] )
-                seq = "".join(subseqs)
-                print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(paths) )
-                print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, path[0], path[-1])
-                print >> sv_tigs, seq
-                sv_tig_idx += 1
-                bundle_index += 1
-                bundle_graph_edges = zip(path[:-1],path[1:])
-
-            #clean up the graph
-
-            edges = set(G.edges())
-            edges_to_be_removed |= set(bundle_graph_edges)
-
-            if DEBUG_LOG_LEVEL > 2:
-                print "BGE",bundle_graph_edges
-
-            edge_remove_count = 0
-            for v, w in edges_to_be_removed:
-                if (v, w) in edges:
-                    G.remove_edge( v, w )
-                    G_r.remove_edge( w, v )
-                    G_edges.remove( (v, w) )
-                    edge_remove_count += 1
-                    if DEBUG_LOG_LEVEL > 2:
-                        print "remove edge", bundle_index, w, v
-
-            edges = set(G.edges())
-            for v, w in edges_to_be_removed:
-
-                r_id, end = v.split(":")
-                end = "E" if end == "B" else "B"
-                v = r_id + ":" + end
-
-                r_id, end = w.split(":")
-                end = "E" if end == "B" else "B"
-                w = r_id + ":" + end
-
-                if (w, v) in edges:
-                    G.remove_edge( w, v )
-                    G_edges.remove( (w, v) )
-                    G_r.remove_edge( v, w )
-                    edge_remove_count += 1
-                    if DEBUG_LOG_LEVEL > 2:
-                        print "remove edge", bundle_index, w, v
-
-            if edge_remove_count == 0:
-                break
-
-            nodes = G.nodes()
-            for n in nodes:
-                if G.in_degree(n) == 0 and G.out_degree(n) == 0:
-                    G.remove_node(n)
-                    G_r.remove_node(n)
-                    if DEBUG_LOG_LEVEL > 2:
-                        print "remove node", n
-
-    sv_tig_paths.close()
-    sv_tigs.close()
-    main_tig_paths.close()
-    out_f.close()
-    bundle_edge_out.close()
-    return ASM_graph
-
-
-
-def SGToNXG(sg):
-    G=nx.DiGraph()
-
-    max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
-    out_f = open("edges_list","w")
-    for v, w in sg.edges:
-        if sg.e_reduce[(v, w)] != True:
-        ##if 1:
-            out_degree = len(sg.nodes[v].out_edges)
-            G.add_node( v, size = out_degree )
-            G.add_node( w, size = out_degree )
-            label = sg.edges[ (v, w) ].attr["label"]
-            score = sg.edges[ (v, w) ].attr["score"]
-            print >>out_f, v, w, label, score
-            G.add_edge( v, w, label = label, weight = 0.001*score, n_weight = max_score - score )
-            #print in_node_name, out_node_name
-    out_f.close()
-    return G
-
-if __name__ == "__main__":
-
-    import argparse
-
-    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
-    parser.add_argument('overlap_file', help='a file that contains the overlap information.')
-    parser.add_argument('read_fasta', help='the file that contains the sequence to be assembled')
-    parser.add_argument('--min_len', type=int, default=4000,
-                        help='minimum length of the reads to be considered for assembling')
-    parser.add_argument('--min_idt', type=float, default=96,
-                        help='minimum alignment identity of the reads to be considered for assembling')
-    parser.add_argument('--disable_chimer_prediction', action="store_true", default=False,
-                        help='you may want to disable this, as some reads can be falsely identified as chimers in low-coverage cases')
-
-    args = parser.parse_args()
-
-
-    overlap_file = args.overlap_file
-    read_fasta = args.read_fasta
-
-    contained_reads = set()
-    chimer_ids = set()
-
-    with open("rc_out_all") as f:
-        for l in f:
-            l = l.strip().split()
-            if l[1] == "2":
-                chimer_ids.add(l[0])
-            if l[1] == "1":
-                contained_reads.add(l[0])
-    print len(chimer_ids)
-
-    seqs = {}
-    # load all p-reads into memory
-    f = FastaReader(read_fasta)
-    for r in f:
-        if r.name in contained_reads:
-            continue
-        if r.name in chimer_ids:
-            continue
-        seqs[r.name] = r.sequence.upper()
-
-    G=nx.Graph()
-    edges =set()
-    overlap_data = []
-    contained_reads = set()
-    overlap_count = {}
-
-
-    # loop through the overlap data and load it into a Python array
-    # contained reads are identified
-
-    with open(overlap_file) as f:
-        for l in f:
-            l = l.strip().split()
-
-            # work around some ill-formed data records
-            if len(l) != 13:
-                continue
-
-            f_id, g_id, score, identity = l[:4]
-            if f_id == g_id:  # don't need self-self overlapping
-                continue
-
-            if g_id not in seqs:
-                continue
-
-            if f_id not in seqs:
-                continue
-
-            score = int(score)
-            identity = float(identity)
-            contained = l[12]
-            if contained == "contained":
-                contained_reads.add(f_id)
-                continue
-            if contained == "contains":
-                contained_reads.add(g_id)
-                continue
-            if contained == "none":
-                continue
-
-            if identity < args.min_idt: # only take records with identity above the threshold as overlapped reads
-                continue
-            #if score > -2000:
-            #    continue
-            f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
-            g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
-
-            # only use reads longer than min_len (4 kb by default) for assembly
-            if f_len < args.min_len: continue
-            if g_len < args.min_len: continue
-
-            # double check for proper overlap
-            if f_start > 24 and f_len - f_end > 24:  # allow a 24-base tolerance on both sides of the overlap
-                continue
-
-            if g_start > 24 and g_len - g_end > 24:
-                continue
-
-            if g_strain == 0:
-                if f_start < 24 and g_len - g_end > 24:
-                    continue
-                if g_start < 24 and f_len - f_end > 24:
-                    continue
-            else:
-                if f_start < 24 and g_start > 24:
-                    continue
-                if g_start < 24 and f_start > 24:
-                    continue
-
-            overlap_data.append( (f_id, g_id, score, identity,
-                                  f_strain, f_start, f_end, f_len,
-                                  g_strain, g_start, g_end, g_len) )
-
-            overlap_count[f_id] = overlap_count.get(f_id,0)+1
-            overlap_count[g_id] = overlap_count.get(g_id,0)+1
-
-    print "###", len(overlap_data), len(contained_reads)
-    overlap_set = set()
-    sg = StringGraph()
-    for od in overlap_data:
-        f_id, g_id, score, identity = od[:4]
-        if f_id in contained_reads:
-            continue
-        if g_id in contained_reads:
-            continue
-        f_s, f_b, f_e, f_l = od[4:8]
-        g_s, g_b, g_e, g_l = od[8:12]
-        overlap_pair = [f_id, g_id]
-        overlap_pair.sort()
-        overlap_pair = tuple( overlap_pair )
-        if overlap_pair in overlap_set:  # don't allow duplicated records
-            continue
-        else:
-            overlap_set.add(overlap_pair)
-
-
-        if g_s == 1: # reversed alignment, swap the begin and end coordinates
-            g_b, g_e = g_e, g_b
-
-        # build the string graph edges for each overlap
-        if f_b > 24:
-            if g_b < g_e:
-                """
-                     f.B         f.E
-                  f  ----------->
-                  g         ------------->
-                            g.B           g.E
-                """
-                if f_b == 0 or g_e - g_l == 0:
-                    continue
-                sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
-                                                           length = abs(f_b-0),
-                                                           score = -score)
-                sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_e, g_l),
-                                                           length = abs(g_e-g_l),
-                                                           score = -score)
-            else:
-                """
-                     f.B         f.E
-                  f  ----------->
-                  g         <-------------
-                            g.E           g.B
-                """
-                if f_b == 0 or g_e == 0:
-                    continue
-                sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
-                                                           length = abs(f_b -0),
-                                                           score = -score)
-                sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_e, 0),
-                                                           length = abs(g_e- 0),
-                                                           score = -score)
-        else:
-            if g_b < g_e:
-                """
-                                    f.B         f.E
-                  f                 ----------->
-                  g         ------------->
-                            g.B           g.E
-                """
-                if g_b == 0 or f_e - f_l == 0:
-                    continue
-                sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_b, 0),
-                                                           length = abs(g_b - 0),
-                                                           score = -score)
-                sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
-                                                           length = abs(f_e-f_l),
-                                                           score = -score)
-            else:
-                """
-                                    f.B         f.E
-                  f                 ----------->
-                  g         <-------------
-                            g.E           g.B
-                """
-                if g_b - g_l == 0 or f_e - f_l ==0:
-                    continue
-                sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_b, g_l),
-                                                           length = abs(g_b - g_l),
-                                                           score = -score)
-                sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
-                                                           length = abs(f_e - f_l),
-                                                           score = -score)
-
-
-    sg.init_reduce_dict()
-    #if not args.disable_chimer_prediction:
-    #    sg.mark_chimer_edge()
-    sg.mark_spur_edge()
-    sg.mark_tr_edges() # mark edges that are transitively redundant
-
-    #if DEBUG_LOG_LEVEL > 1:
-    if 1:
-        print sum( [1 for c in sg.e_reduce.values() if c == True] )
-        print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-    sg.mark_best_overlap() # mark those edges that are best overlap edges
-
-    if DEBUG_LOG_LEVEL > 1:
-        print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-
-    G = SGToNXG(sg)
-    nx.write_gexf(G, "string_graph.gexf") # output the raw string graph for visualization
-    nx.write_adjlist(G, "string_graph.adj") # write out the adjacency list of the whole string graph
-
-    u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa") # reduce the string graph into a unitig graph
-    ASM_graph = get_bundles(u_edges )  # get the assembly
-    nx.write_gexf(ASM_graph, "asm_graph.gexf")
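
A minimal sketch of the edge convention used in the block above, with a networkx DiGraph standing in for the script's graph object (keyword arguments to add_edge become edge attributes either way); the function name is hypothetical and only the first orientation case is shown. Each read X contributes two nodes, "X:B" and "X:E", every proper dovetail overlap adds a complementary pair of directed edges whose labels record the unaligned stretch of the read the edge traverses, and the length and score attributes are what mark_tr_edges() and mark_best_overlap() later act on.

    import networkx as nx

    def add_dovetail_edges(sg, f_id, f_b, g_id, g_e, g_l, score):
        # first orientation case: both reads forward, f overhanging on the left
        if f_b == 0 or g_e == g_l:
            return  # degenerate (zero-length) edge; skipped, as in the guard above
        sg.add_edge("%s:B" % g_id, "%s:B" % f_id,
                    label="%s:%d-%d" % (f_id, f_b, 0),
                    length=abs(f_b), score=-score)
        sg.add_edge("%s:E" % f_id, "%s:E" % g_id,
                    label="%s:%d-%d" % (g_id, g_e, g_l),
                    length=abs(g_e - g_l), score=-score)
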
diff --git a/FALCON/src/py_scripts_v0.1/falcon_dedup.py b/FALCON/src/py_scripts_v0.1/falcon_dedup.py
deleted file mode 100644
index b574fad..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_dedup.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import subprocess
-from pbcore.io import FastaReader
-
-def get_matches(seq0, seq1):
-    with open("tmp_seq0.fa","w") as f:
-        print >>f, ">seq0"
-        print >>f, seq0
-    with open("tmp_seq1.fa","w") as f:
-        print >>f, ">seq1"
-        print >>f, seq1
-    mgaps_out=subprocess.check_output("mummer -maxmatch -c -b -l 24 tmp_seq0.fa tmp_seq1.fa | mgaps ", stderr = open("/dev/null"), shell=True)
-
-    matches = []
-    cluster = []
-    for l in mgaps_out.split("\n"):
-        l = l.strip().split()
-        if len(l) == 0:
-            continue
-        if l[0] == ">":
-            seq_id = l[1]
-
-            if len(cluster) != 0:
-                matches.append(cluster)
-
-            cluster = []
-            continue
-        if l[0] == "#":
-            if len(cluster) != 0:
-                matches.append(cluster)
-            cluster = []
-            continue
-        len_ = int(l[2])
-        r_s = int(l[0])
-        q_s = int(l[1])
-        r_e = r_s + len_
-        q_e = q_s + len_
-        cluster.append( ((r_s, r_e), (q_s, q_e)) )
-    if len(cluster) != 0:
-        matches.append(cluster)
-    return matches
-
-
-u_edges = {}
-with open("./unit_edges.dat") as f:
-    for l in f:
-        v, w, path, seq = l.strip().split()
-        u_edges.setdefault( (v, w), [] )
-        u_edges[ (v, w) ].append( (path, seq) )
-
-
-p_tig_path = {}
-a_tig_path = {}
-with open("primary_tigs_paths_c") as f:
-    for l in f:
-        l = l.strip().split()
-        id_ = l[0][1:]
-        path = l[1:]
-        p_tig_path[id_] = path
-
-with open("all_tigs_paths") as f:
-    for l in f:
-        l = l.strip().split()
-        id_ = l[0][1:]
-        path = l[1:]
-        a_tig_path[id_] = path
-
-p_tig_seqs = {}
-for r in FastaReader("primary_tigs_c.fa"):
-    p_tig_seqs[r.name] = r.sequence
-
-a_tig_seqs = {}
-for r in FastaReader("all_tigs.fa"):
-    a_tig_seqs[r.name.split()[0]] = r.sequence
-
-p_tig_to_node_pos = {}
-node_pos = []
-with open("primary_tigs_node_pos_c") as f:
-    for l in f:
-        l = l.strip().split()
-        p_tig_to_node_pos.setdefault( l[0], [])
-        p_tig_to_node_pos[l[0]].append( (l[1], int(l[2])))
-
-duplicate_a_tigs = []
-with open("a_nodup.fa","w") as out_f:
-    for p_tig_id in p_tig_path:
-        main_path = p_tig_path[p_tig_id]
-        main_path_nodes = set(main_path[:])
-        p_tig_seq = p_tig_seqs[p_tig_id]
-        a_node = []
-        a_node_range = []
-        a_node_range_map = {}
-        node_to_pos = dict( p_tig_to_node_pos[p_tig_id] )
-        for id_ in a_tig_path:
-            if id_[:4] != p_tig_id[:4]:
-                continue
-            if id_.split("-")[1] == "0000":
-                continue
-
-            a_path = a_tig_path[id_]
-            if a_path[0] in main_path_nodes and a_path[-1] in main_path_nodes:
-                #print p_tig_id, id_, a_path[0], a_path[-1]
-                s, e = node_to_pos[a_path[0]], node_to_pos[a_path[-1]]
-                p_seq = p_tig_seq[s:e]
-                a_seq = a_tig_seqs[id_]
-                seq_match = get_matches(p_seq, a_seq)
-                if len(seq_match) > 1:
-                    print >>out_f, ">"+id_
-                    print >>out_f,  a_seq
-                    continue
-                try:
-                    r_s, r_e = seq_match[0][0][0][0], seq_match[0][-1][0][1]
-                except:
-                    print "XXX", seq_match
-                if 1.0* (r_e - r_s) / (e - s) > 98:
-                    print >>out_f, ">"+id_
-                    print >>out_f, a_seq
-                    continue
-                duplicate_a_tigs.append(id_)
-
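
For orientation, get_matches() above returns a list of clusters, each cluster being a list of ((ref_start, ref_end), (qry_start, qry_end)) match pairs parsed from mummer/mgaps; the dedup loop takes the reference span of a single cluster from its first and last matches and compares it to the primary-contig window it was cut from. A minimal sketch of that bookkeeping (helper names are hypothetical):

    def cluster_ref_span(cluster):
        # cluster: list of ((r_s, r_e), (q_s, q_e)) tuples from get_matches()
        return cluster[0][0][0], cluster[-1][0][1]

    def ref_coverage(cluster, s, e):
        # fraction of the primary-contig window [s, e) spanned by the cluster
        r_s, r_e = cluster_ref_span(cluster)
        return float(r_e - r_s) / (e - s)
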
diff --git a/FALCON/src/py_scripts_v0.1/falcon_fixasm.py b/FALCON/src/py_scripts_v0.1/falcon_fixasm.py
deleted file mode 100644
index 525d5be..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_fixasm.py
+++ /dev/null
@@ -1,213 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import networkx as nx
-from pbcore.io import FastaReader
-
-def neighbor_bound(G, v, w, radius):
-    g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
-    g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
-    if len(g1) < radius or len(g2) < radius:
-        return True
-    print v, len(g1), w, len(g2), radius
-    if len(set(g1.edges()) & set(g2.edges())) > 0:
-        return True
-    else:
-        return False
-
-def is_branch_node(G, n):
-    out_edges = G.out_edges([n])
-    n2 = [ e[1] for e in out_edges ]
-    is_branch = False
-    for i in range(len(n2)):
-        for j in range(i+1, len(n2)):
-            v = n2[i]
-            w = n2[j]
-            if neighbor_bound(G, v, w, 20) == False:
-                is_branch = True
-                break
-        if is_branch == True:
-            break
-    return is_branch
-
-
-def get_r_path(r_edges, u_path):
-    tiling_path = []
-    pos = 0
-
-    for i in range( len(u_path) - 1):
-        v, w = u_path[i:i+2]
-        r_edge_label, overlap = r_edges[ (v, w) ]
-        r_edge_seq_id, range_ = r_edge_label.split(":")
-        range_ = range_.split("-")
-        s, e = int(range_[0]), int(range_[1])
-        pos += abs(e-s)
-        tiling_path.append( (pos, w, s, e) )
-    return tiling_path
-
-def get_seq(u_edges, r_edges, path):
-    subseqs = []
-    pos = []
-    cur_pos = 0
-    full_tiling_path = []
-
-    for i in range( len(path) - 1):
-        v, w = path[i:i+2]
-        pos.append( (v, cur_pos) )
-        uedges = u_edges[ (v, w) ]
-        uedges.sort( key= lambda x: len(x[0]) )
-        subseqs.append( uedges[-1][1] )
-        r_path = get_r_path( r_edges, uedges[-1][0].split("-") )
-        r_path = [ ( x[0] + cur_pos, x[1], x[2], x[3]) for x in r_path ]
-        full_tiling_path.extend( r_path )
-        cur_pos += len( uedges[-1][1] )
-    pos.append( (w, cur_pos) )
-    return "".join(subseqs), pos, full_tiling_path
-
-
-u_edges = {}
-with open("unit_edges.dat") as f:
-    for l in f:
-        v, w, path, seq = l.strip().split()
-        u_edges.setdefault( (v, w), [] )
-        u_edges[ (v, w) ].append( (path, seq) )
-len(u_edges)
-
-
-r_edges = {}
-with open("edges_list") as f:
-    for l in f:
-        v, w, edge_label, overlap = l.strip().split()
-        r_edges[ (v, w) ] = (edge_label, int(overlap) )
-
-
-primary_tigs_path = {}
-primary_path_graph = nx.DiGraph()
-begin_nodes = {}
-end_nodes ={}
-with open("primary_tigs_paths") as f:
-    for l in f:
-        l = l.strip().split()
-        name = l[0][1:]
-        path = l[1:]
-        primary_tigs_path[name] = path
-        if len(path) < 3:
-            continue
-        for i in range(len(path)-1):
-            n1 = path[i].split(":")[0]
-            n2 = path[i+1].split(":")[0]
-            primary_path_graph.add_edge( n1, n2)
-        begin_nodes.setdefault(path[0], [])
-        begin_nodes[path[0]].append( name )
-        end_nodes.setdefault(path[-1], [])
-        end_nodes[path[-1]].append( name )
-
-
-
-path_names = primary_tigs_path.keys()
-path_names.sort()
-primary_path_graph_r = primary_path_graph.reverse()
-path_f = open("primary_tigs_paths_c","w")
-pos_f = open("primary_tigs_node_pos_c", "w")
-tiling_path_f = open("all_tiling_path_c", "w")
-with open("primary_tigs_c.fa","w") as out_f:
-    for name in path_names:
-        sub_idx = 0
-        c_path = [ primary_tigs_path[name][0] ]
-        for v in primary_tigs_path[name][1:]:
-            break_path = False
-
-            vn = v.split(":")[0]
-
-            if primary_path_graph.out_degree(vn) > 1:
-                break_path = is_branch_node(primary_path_graph, vn)
-            if primary_path_graph.in_degree(vn) > 1:
-                break_path = is_branch_node(primary_path_graph_r, vn)
-            if break_path:
-                c_path.append(v)
-                seq, pos, full_tiling_path = get_seq(u_edges, r_edges, c_path)
-                for p, w, s, e in full_tiling_path:
-                    print >> tiling_path_f, "%s_%02d" % (name, sub_idx), p, w, s, e
-                #if len(full_tiling_path) <= 5:
-                #    continue
-                print >>out_f, ">%s_%02d" % (name, sub_idx)
-                print >>out_f, seq
-                print >>path_f, ">%s_%02d" % (name, sub_idx), " ".join(c_path)
-                #print c_path
-                for node, p in pos:
-                    print >> pos_f, "%s_%02d %s %d" % (name, sub_idx, node, p)
-                c_path = [v]
-                sub_idx += 1
-            else:
-                c_path.append(v)
-
-        if len(c_path) > 1:
-            seq, pos, full_tiling_path = get_seq(u_edges, r_edges, c_path)
-            for p, w, s, e in full_tiling_path:
-                print >> tiling_path_f, "%s_%02d" % (name, sub_idx), p, w, s, e
-            if len(full_tiling_path) <= 5:
-                continue
-            print >>out_f, ">%s_%02d" % (name, sub_idx)
-            print >>out_f, seq
-            print >>path_f, ">%s_%02d" % (name, sub_idx), " ".join(c_path)
-            for node, p in pos:
-                print >> pos_f, "%s_%02d %s %d" % (name, sub_idx, node, p)
-
-with open("all_tigs_paths") as f:
-    for l in f:
-        l = l.strip().split()
-        name = l[0][1:]
-        name = name.split("-")
-        if name[1] == "0000":
-            continue
-        if len(name) == 2:
-            path = l[1:]
-            seq, pos, full_tiling_path = get_seq(u_edges, r_edges, path)
-            for p, w, s, e in full_tiling_path:
-                print >> tiling_path_f, "%s" % ("-".join(name)), p, w, s, e
-        else:
-            path = l[1:]
-            full_tiling_path = get_r_path(r_edges, path)
-            for p, w, s, e in full_tiling_path:
-                print >> tiling_path_f, "%s" % ("-".join(name)), p, w, s, e
-
-
-path_f.close()
-tiling_path_f.close()
-pos_f.close()
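
The break criterion in falcon_fixasm.py is compact but subtle: a node on a primary path is split only when two of its successors' forward neighborhoods (ego graphs of radius 20) share no edge, meaning the branches do not reconverge quickly; the same test is run on predecessors via the reversed graph. A commented restatement of that test, under a hypothetical name (it inverts the sense of neighbor_bound above):

    import networkx as nx

    def diverges_for_good(G, v, w, radius=20):
        # downstream neighborhoods of the two branch targets, following edge direction
        g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
        g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
        if len(g1) < radius or len(g2) < radius:
            return False  # too close to a tip to call it a real divergence
        # branches that reconverge within the radius share at least one edge
        return len(set(g1.edges()) & set(g2.edges())) == 0
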
diff --git a/FALCON/src/py_scripts_v0.1/falcon_overlap.py b/FALCON/src/py_scripts_v0.1/falcon_overlap.py
deleted file mode 100755
index 1ab5a99..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_overlap.py
+++ /dev/null
@@ -1,328 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs, seqs
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-def get_ovelap_alignment(seq1, seq0):
-
-    K = 8
-    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-    sa_ptr = kup.allocate_seq( len(seq0) )
-    sda_ptr = kup.allocate_seq_addr( len(seq0) )
-    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
-    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
-    aln_range = aln_range_ptr[0]
-    kup.free_kmer_match(kmer_match_ptr)
-    s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
-    e1 += K + K/2
-    e0 += K + K/2
-    kup.free_aln_range(aln_range)
-    len_1 = len(seq1)
-    len_0 = len(seq0)
-    if e1 > len_1:
-        e1 = len_1
-    if e0 > len_0:
-        e0 = len_0
-    do_aln = False
-    contain_status = "none"
-    #print s0, e0, s1, e1
-    if e1 - s1 > 500:
-        if s0 < s1 and s0 > 24:
-            do_aln = False
-        elif s1 <= s0 and s1 > 24:
-            do_aln = False
-        elif s1 < 24 and len_1 - e1 < 24:
-            do_aln = True
-            contain_status = "contains"
-            #print "X1"
-        elif s0 < 24 and len_0 - e0 < 24:
-            do_aln = True
-            contain_status = "contained"
-            #print "X2"
-        else:
-            do_aln = True
-            if s0 < s1:
-                s1 -= s0 #assert s1 > 0
-                s0 = 0
-                e1 = len_1
-                #if len_1 - s1 >= len_0:
-                #    do_aln = False
-                #    contain_status = "contains"
-                #    print "X3", s0, e0, len_0, s1, e1, len_1
-
-
-            elif s1 <= s0:
-                s0 -= s1 #assert s1 > 0
-                s1 = 0
-                e0 = len_0
-                #print s0, e0, s1, e1
-                #if len_0 - s0 >= len_1:
-                #    do_aln = False
-                #    contain_status = "contained"
-                #    print "X4"
-        #if abs( (e1 - s1) - (e0 - s0 ) ) > 200:  #avoid overlap alignment for big indels
-        #    do_aln = False
-
-        if do_aln:
-            alignment = DWA.align(seq1[s1:e1], e1-s1,
-                                  seq0[s0:e0], e0-s0,
-                                  500, 0)
-            #print seq1[s1:e1]
-            #print seq0[s2:e2]
-            #if alignment[0].aln_str_size > 500:
-
-            #aln_str1 = alignment[0].q_aln_str
-            #aln_str0 = alignment[0].t_aln_str
-            aln_size = alignment[0].aln_str_size
-            aln_dist = alignment[0].dist
-            aln_q_s = alignment[0].aln_q_s
-            aln_q_e = alignment[0].aln_q_e
-            aln_t_s = alignment[0].aln_t_s
-            aln_t_e = alignment[0].aln_t_e
-            assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
-            #print aln_str1
-            #print aln_str0
-            if aln_size > 500 and contain_status == "none":
-                contain_status = "overlap"
-            DWA.free_alignment(alignment)
-
-    kup.free_seq_addr_array(sda_ptr)
-    kup.free_seq_array(sa_ptr)
-    kup.free_kmer_lookup(lk_ptr)
-
-    if do_aln:
-        if s1 > 1000 and s0 > 1000:
-            return 0, 0, 0, 0, 0, 0, "none"
-        if len_1 - (s1+aln_q_e-aln_q_s) > 1000 and len_0 - (s0+aln_t_e-aln_t_s) > 1000:
-            return 0, 0, 0, 0, 0, 0, "none"
-
-
-
-
-    if e1 - s1 > 500 and do_aln and aln_size > 500:
-        #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
-        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
-    else:
-        return 0, 0, 0, 0, 0, 0, contain_status
-
-def get_candidate_aln(hit_input):
-
-    global q_seqs
-    q_name, hit_index_f, hit_index_r = hit_input
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-
-    hit_index = hit_index_f
-    c = collections.Counter(hit_index)
-    s = [c[0] for c in c.items() if c[1] >50]
-    #s.sort()
-    targets = set()
-    for p in s:
-        hit_id = seqs[p][0]
-        if hit_id in targets or hit_id == q_name:
-            continue
-        targets.add(hit_id)
-        seq1, seq0 = q_seq, q_seqs[hit_id]
-        aln_data = get_ovelap_alignment(seq1, seq0)
-        #rtn = get_alignment(seq1, seq0)
-        if rtn != None:
-
-            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
-            #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
-            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
-                          0, s2, e2, len(seq0),
-                          0, s1, e1, len(seq1), c_status ) )
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    hit_index = hit_index_r
-    c = collections.Counter(hit_index)
-    s = [c[0] for c in c.items() if c[1] >50]
-    #s.sort()
-    targets = set()
-    for p in s:
-        hit_id = seqs[p][0]
-        if hit_id in targets or hit_id == q_name:
-            continue
-        targets.add(hit_id)
-        seq1, seq0 = r_q_seq, q_seqs[hit_id]
-        aln_data = get_ovelap_alignment(seq1, seq0)
-        #rtn = get_alignment(seq1, seq0)
-        if rtn != None:
-            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
-            #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
-            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
-                          0, s2, e2, len(seq0),
-                          1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status ) )
-
-    return rtn
-
-def build_look_up(seqs, K):
-    global sa_ptr, sda_ptr, lk_ptr
-
-    total_index_base = len(seqs) * 1000
-    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    kup.init_seq_array(c_sa_ptr, total_index_base)
-
-    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
-    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
-    start = 0
-    for r_name, seq in seqs:
-        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
-        start += 1000
-
-    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 512)
-
-    #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
-
-    global sa_ptr, sda_ptr, lk_ptr
-    global q_seqs
-
-    K = 14
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-    return  q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
-    for q_name, q_seq in q_seqs.items():
-        yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
-    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
-        yield mr
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
-    parser.add_argument('fasta_file', help='a fasta file of reads; all pairwise overlaps among them are computed')
-    parser.add_argument('--min_len', type=int, default=4000,
-                        help='minimum length of the reads to be considered for overlapping')
-    parser.add_argument('--n_core', type=int, default=1,
-                        help='number of processes used for detailed overlapping evaluation')
-    parser.add_argument('--d_core', type=int, default=1,
-                        help='number of processes used for k-mer matching')
-
-
-    args = parser.parse_args()
-
-    seqs = []
-    q_seqs = {}
-    f = FastaReader(args.fasta_file) # read the input fasta file named on the command line
-
-    if  args.min_len < 2200:
-         args.min_len = 2200
-
-    idx = 0
-    for r in f:
-        if len(r.sequence) < args.min_len:
-            continue
-        seq = r.sequence.upper()
-        for start in range(0, len(seq), 1000):
-            if start+1000 > len(seq):
-                break
-            seqs.append( (r.name, seq[start: start+1000]) )
-            idx += 1
-
-        #seqs.append( (r.name, seq[:1000]) )
-        seqs.append( (r.name, seq[-1000:]) )
-        idx += 1
-
-        q_seqs[r.name] = seq
-
-
-    total_index_base = len(seqs) * 1000
-    pool = mp.Pool(args.n_core)
-    K = 14
-    build_look_up(seqs, K)
-    m_pool = mp.Pool(args.d_core)
-
-
-    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
-    for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
-        for h in r:
-            print " ".join([str(x) for x in h])
-
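
The indexing strategy in falcon_overlap.py is worth spelling out: each read above the length cutoff is cut into 1000-bp chunks, every chunk goes into one shared-memory k-mer lookup, and a query's k-mer hit coordinates are mapped back to a chunk number by integer division by 1000; any chunk collecting more than 50 hits nominates the read it came from as an overlap candidate, which is then aligned in detail by get_ovelap_alignment(). A minimal pure-Python sketch of the nomination step (names are hypothetical; the shared-memory lookup is assumed to have produced the hit coordinates already):

    import collections

    def candidate_reads(hit_positions, chunk_owner, q_name, min_hits=50):
        # hit_positions: k-mer hit coordinates in the concatenated chunk index
        # chunk_owner:   chunk index -> read name (seqs[p][0] in the script)
        chunk_hits = collections.Counter(p // 1000 for p in hit_positions)
        candidates, seen = [], set()
        for chunk, n in chunk_hits.items():
            if n <= min_hits:
                continue
            read = chunk_owner[chunk]
            if read == q_name or read in seen:
                continue
            seen.add(read)
            candidates.append(read)
        return candidates
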
diff --git a/FALCON/src/py_scripts_v0.1/falcon_overlap2.py b/FALCON/src/py_scripts_v0.1/falcon_overlap2.py
deleted file mode 100755
index a8f632c..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_overlap2.py
+++ /dev/null
@@ -1,337 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs,t_seqs, seqs
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-def get_ovelap_alignment(seq1, seq0):
-
-    K = 8
-    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-    sa_ptr = kup.allocate_seq( len(seq0) )
-    sda_ptr = kup.allocate_seq_addr( len(seq0) )
-    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
-    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
-    aln_range = aln_range_ptr[0]
-    kup.free_kmer_match(kmer_match_ptr)
-    s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
-    e1 += K + K/2
-    e0 += K + K/2
-    kup.free_aln_range(aln_range)
-    len_1 = len(seq1)
-    len_0 = len(seq0)
-    if e1 > len_1:
-        e1 = len_1
-    if e0 > len_0:
-        e0 = len_0
-    do_aln = False
-    contain_status = "none"
-    #print s0, e0, s1, e1
-    if e1 - s1 > 500:
-        if s0 < s1 and s0 > 24:
-            do_aln = False
-        elif s1 <= s0 and s1 > 24:
-            do_aln = False
-        elif s1 < 24 and len_1 - e1 < 24:
-            do_aln = True
-            contain_status = "contains"
-            #print "X1"
-        elif s0 < 24 and len_0 - e0 < 24:
-            do_aln = True
-            contain_status = "contained"
-            #print "X2"
-        else:
-            do_aln = True
-            if s0 < s1:
-                s1 -= s0 #assert s1 > 0
-                s0 = 0
-                e1 = len_1
-                #if len_1 - s1 >= len_0:
-                #    do_aln = False
-                #    contain_status = "contains"
-                #    print "X3", s0, e0, len_0, s1, e1, len_1
-
-
-            elif s1 <= s0:
-                s0 -= s1 #assert s1 > 0
-                s1 = 0
-                e0 = len_0
-                #print s0, e0, s1, e1
-                #if len_0 - s0 >= len_1:
-                #    do_aln = False
-                #    contain_status = "contained"
-                #    print "X4"
-        #if abs( (e1 - s1) - (e0 - s0 ) ) > 200:  #avoid overlap alignment for big indels
-        #    do_aln = False
-
-        if do_aln:
-            alignment = DWA.align(seq1[s1:e1], e1-s1,
-                                  seq0[s0:e0], e0-s0,
-                                  500, 0)
-            #print seq1[s1:e1]
-            #print seq0[s2:e2]
-            #if alignment[0].aln_str_size > 500:
-
-            #aln_str1 = alignment[0].q_aln_str
-            #aln_str0 = alignment[0].t_aln_str
-            aln_size = alignment[0].aln_str_size
-            aln_dist = alignment[0].dist
-            aln_q_s = alignment[0].aln_q_s
-            aln_q_e = alignment[0].aln_q_e
-            aln_t_s = alignment[0].aln_t_s
-            aln_t_e = alignment[0].aln_t_e
-            assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
-            #print aln_str1
-            #print aln_str0
-            if aln_size > 500 and contain_status == "none":
-                contain_status = "overlap"
-            DWA.free_alignment(alignment)
-
-    kup.free_seq_addr_array(sda_ptr)
-    kup.free_seq_array(sa_ptr)
-    kup.free_kmer_lookup(lk_ptr)
-
-    if do_aln:
-        if s1 > 1000 and s0 > 1000:
-            return 0, 0, 0, 0, 0, 0, "none"
-        if len_1 - (s1+aln_q_e-aln_q_s) > 1000 and len_0 - (s0+aln_t_e-aln_t_s) > 1000:
-            return 0, 0, 0, 0, 0, 0, "none"
-
-    if e1 - s1 > 500 and do_aln and aln_size > 500:
-        #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
-        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
-    else:
-        return 0, 0, 0, 0, 0, 0, contain_status
-
-def get_candidate_aln(hit_input):
-
-    global q_seqs, seqs, t_seqs
-    q_name, hit_index_f, hit_index_r = hit_input
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-
-    hit_index = hit_index_f
-    c = collections.Counter(hit_index)
-    s = [c[0] for c in c.items() if c[1] >50]
-    #s.sort()
-    targets = set()
-    for p in s:
-        hit_id = seqs[p][0]
-        if hit_id in targets or hit_id == q_name:
-            continue
-        targets.add(hit_id)
-        seq1, seq0 = q_seq, t_seqs[hit_id]
-        aln_data = get_ovelap_alignment(seq1, seq0)
-        #rtn = get_alignment(seq1, seq0)
-        if rtn != None:
-
-            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
-            if c_status == "none":
-                continue
-            #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
-            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
-                          0, s2, e2, len(seq0),
-                          0, s1, e1, len(seq1), c_status ) )
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    hit_index = hit_index_r
-    c = collections.Counter(hit_index)
-    s = [c[0] for c in c.items() if c[1] >50]
-    #s.sort()
-    targets = set()
-    for p in s:
-        hit_id = seqs[p][0]
-        if hit_id in targets or hit_id == q_name:
-            continue
-        targets.add(hit_id)
-        seq1, seq0 = r_q_seq, t_seqs[hit_id]
-        aln_data = get_ovelap_alignment(seq1, seq0)
-        #rtn = get_alignment(seq1, seq0)
-        if rtn != None:
-            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
-            if c_status == "none":
-                continue
-            #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
-            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
-                          0, s2, e2, len(seq0),
-                          1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status ) )
-
-    return rtn
-
-def build_look_up(seqs, K):
-    global sa_ptr, sda_ptr, lk_ptr
-
-    total_index_base = len(seqs) * 1000
-    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    kup.init_seq_array(c_sa_ptr, total_index_base)
-
-    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
-    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
-    start = 0
-    for r_name, seq in seqs:
-        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
-        start += 1000
-
-    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 256)
-
-    #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
-
-    global sa_ptr, sda_ptr, lk_ptr
-    global q_seqs
-
-    K = 14
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-    return  q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
-    for q_name, q_seq in q_seqs.items():
-        yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
-    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
-        yield mr
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
-    parser.add_argument('query_fa', help='a fasta file of query sequences to be overlapped against the target')
-    parser.add_argument('target_fa', help='a fasta file of target sequences for overlapping')
-    parser.add_argument('--min_len', type=int, default=4000,
-                        help='minimum length of the reads to be considered for overlapping')
-    parser.add_argument('--n_core', type=int, default=1,
-                        help='number of processes used for detailed overlapping evaluation')
-    parser.add_argument('--d_core', type=int, default=1,
-                        help='number of processes used for k-mer matching')
-
-
-    args = parser.parse_args()
-
-    seqs = []
-    q_seqs = {}
-    t_seqs = {}
-    f = FastaReader(args.target_fa) # read the target fasta file named on the command line
-
-    if  args.min_len < 2200:
-         args.min_len = 2200
-
-    idx = 0
-    for r in f:
-        if len(r.sequence) < args.min_len:
-            continue
-        seq = r.sequence.upper()
-        for start in range(0, len(seq), 1000):
-            if start+1000 > len(seq):
-                break
-            seqs.append( (r.name, seq[start: start+1000]) )
-            idx += 1
-
-        seqs.append( (r.name, seq[-1000:]) )
-        idx += 1
-
-        t_seqs[r.name] = seq
-
-    f = FastaReader(args.query_fa) # read the query fasta file named on the command line
-    for r in f:
-        if len(r.sequence) < args.min_len:
-            continue
-        seq = r.sequence.upper()
-        q_seqs[r.name] = seq
-
-
-    total_index_base = len(seqs) * 1000
-    pool = mp.Pool(args.n_core)
-    K = 14
-    build_look_up(seqs, K)
-    m_pool = mp.Pool(args.d_core)
-
-
-    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
-    for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
-        for h in r:
-            print " ".join([str(x) for x in h])
-
diff --git a/FALCON/src/py_scripts_v0.1/falcon_qrm.py b/FALCON/src/py_scripts_v0.1/falcon_qrm.py
deleted file mode 100755
index 805fcc6..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_qrm.py
+++ /dev/null
@@ -1,370 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-import math
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs,t_seqs, seqs
-global n_candidates, max_candidates
-
-seqs = []
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-all_fivemers = []
-cmap = {0:"A", 1:"T", 2:"C", 3:"G"}
-for i in range(1024):
-    mer = []
-    for j in range(5):
-        mer.append( cmap[ i >> (2 *j) & 3 ])
-    all_fivemers.append("".join(mer))
-
-def fivemer_entropy(seq):
-    five_mer_count = {}
-
-    for i in range(len(seq)-5):
-        five_mer = seq[i:i+5]
-        five_mer_count.setdefault(five_mer, 0)
-        five_mer_count[five_mer] += 1
-
-    entropy = 0.0
-    for five_mer in all_fivemers:
-        p = five_mer_count.get(five_mer, 0) + 1.0
-        p /= len(seq)
-        entropy += - p * math.log(p)
-
-    return entropy
-
-def get_alignment(seq1, seq0):
-
-    K = 8
-    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-    sa_ptr = kup.allocate_seq( len(seq0) )
-    sda_ptr = kup.allocate_seq_addr( len(seq0) )
-    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
-    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
-    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
-    aln_range = aln_range_ptr[0]
-    kup.free_kmer_match(kmer_match_ptr)
-    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score
-    e1 += K + K/2
-    e0 += K + K/2
-    kup.free_aln_range(aln_range)
-    len_1 = len(seq1)
-    len_0 = len(seq0)
-    if e1 > len_1:
-        e1 = len_1
-    if e0 > len_0:
-        e0 = len_0
-
-    aln_size = 1
-    if e1 - s1 > 500:
-
-        aln_size = max( e1-s1, e0-s0 )
-        aln_score = int(km_score * 48)
-        aln_q_s = s1
-        aln_q_e = e1
-        aln_t_s = s0
-        aln_t_e = e0
-
-    kup.free_seq_addr_array(sda_ptr)
-    kup.free_seq_array(sa_ptr)
-    kup.free_kmer_lookup(lk_ptr)
-
-    if s1 > 1000 and s0 > 1000:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-    if len_1 - e1 > 1000 and len_0 - e0 > 1000:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-
-    if e1 - s1 > 500 and aln_size > 500:
-        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
-    else:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-def get_candidate_aln(hit_input):
-
-    global q_seqs, seqs, t_seqs, q_len
-    global max_candidates
-    global n_candidates
-    q_name, hit_index_f, hit_index_r = hit_input
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-    hit_index = hit_index_f
-    c = collections.Counter(hit_index)
-    s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
-
-    hit_data = {}
-    #hit_ids = set()
-
-    for p, hit_count in s:
-        hit_id = seqs[p][0]
-        hit_data.setdefault(hit_id, [0, 0 ,0])
-        hit_data[hit_id][0] += hit_count;
-        if hit_count > hit_data[hit_id][1]:
-            hit_data[hit_id][1] = hit_count
-        hit_data[hit_id][2] += 1
-
-    hit_data = hit_data.items()
-
-    hit_data.sort( key=lambda x:-x[1][0] )
-
-    target_count = {}
-    total_hit = 0
-
-    for hit in hit_data[:n_candidates]:
-        hit_id = hit[0]
-        hit_count = hit[1][0]
-        target_count.setdefault(hit_id, 0)
-        if target_count[hit_id] > max_candidates:
-            continue
-        if total_hit > max_candidates:
-            continue
-        seq1, seq0 = q_seq, t_seqs[hit_id]
-        aln_data = get_alignment(seq1, seq0)
-        if rtn != None:
-
-            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
-            if c_status == "none":
-                continue
-            target_count[hit_id] += 1
-            total_hit += 1
-            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
-                          0, s1, e1, len(seq1),
-                          0, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    hit_index = hit_index_r
-    c = collections.Counter(hit_index)
-    s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
-
-    hit_data = {}
-    #hit_ids = set()
-
-    for p, hit_count in s:
-        hit_id = seqs[p][0]
-        hit_data.setdefault(hit_id, [0, 0 ,0])
-        hit_data[hit_id][0] += hit_count;
-        if hit_count > hit_data[hit_id][1]:
-            hit_data[hit_id][1] = hit_count
-        hit_data[hit_id][2] += 1
-
-    hit_data = hit_data.items()
-
-    hit_data.sort( key=lambda x:-x[1][0] )
-
-
-    target_count = {}
-    total_hit = 0
-
-    for hit in hit_data[:n_candidates]:
-        hit_id = hit[0]
-        hit_count = hit[1][0]
-        target_count.setdefault(hit_id, 0)
-        if target_count[hit_id] > max_candidates:
-            continue
-        if total_hit > max_candidates:
-            continue
-        seq1, seq0 = r_q_seq, t_seqs[hit_id]
-        aln_data = get_alignment(seq1, seq0)
-        if rtn != None:
-            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
-            if c_status == "none":
-                continue
-            target_count[hit_id] += 1
-            total_hit += 1
-            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
-                          0, len(seq1) - e1, len(seq1) - s1, len(seq1),
-                          1, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
-    return rtn
-
-def build_look_up(seqs, K):
-    global sa_ptr, sda_ptr, lk_ptr
-
-    total_index_base = len(seqs) * 1000
-    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    kup.init_seq_array(c_sa_ptr, total_index_base)
-
-    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
-    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
-    start = 0
-    for r_name, seq in seqs:
-        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
-        start += 1000
-
-    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 1024)
-
-    #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
-
-    global sa_ptr, sda_ptr, lk_ptr
-    global q_seqs
-
-    K = 14
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-    return  q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
-    for q_name, q_seq in q_seqs.items():
-        yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
-    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
-        yield mr
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
-    parser.add_argument('target_fofn', help='a fofn of fasta files providing the target sequences for overlapping')
-    parser.add_argument('query_fofn', help='a fofn of fasta files with query sequences to be overlapped against the target')
-    parser.add_argument('--min_len', type=int, default=4000,
-                        help='minimum length of the reads to be considered for overlapping')
-    parser.add_argument('--n_core', type=int, default=1,
-                        help='number of processes used for detailed overlapping evaluation')
-    parser.add_argument('--d_core', type=int, default=1,
-                        help='number of processes used for k-mer matching')
-    parser.add_argument('--n_candidates', type=int, default=128,
-                        help='number of top-ranked candidate targets to evaluate per read')
-    parser.add_argument('--max_candidates', type=int, default=64,
-                        help='maximum number of matches to output per read')
-
-
-
-    args = parser.parse_args()
-
-    max_candidates = args.max_candidates
-    n_candidates = args.n_candidates
-
-    q_seqs = {}
-    t_seqs = {}
-    if  args.min_len < 1200:
-         args.min_len = 1200
-
-    with open(args.target_fofn) as fofn:
-        for fn in fofn:
-            fn = fn.strip()
-            f = FastaReader(fn) # read each fasta file listed in the target fofn
-            for r in f:
-                if len(r.sequence) < args.min_len:
-                    continue
-                seq = r.sequence.upper()
-                for start in range(0, len(seq), 1000):
-                    if start+1000 > len(seq):
-                        break
-                    subseq = seq[start: start+1000]
-                    #if fivemer_entropy(subseq) < 4:
-                    #    continue
-                    seqs.append( (r.name, subseq) )
-                subseq = seq[-1000:]
-                #if fivemer_entropy(subseq) < 4:
-                #    continue
-                #seqs.append( (r.name, seq[:1000]) )
-                seqs.append( (r.name, subseq) )
-
-                t_seqs[r.name] = seq
-
-    with open(args.query_fofn) as fofn:
-        for fn in fofn:
-            fn = fn.strip()
-            f = FastaReader(fn) # read each fasta file listed in the query fofn
-            for r in f:
-                seq = r.sequence.upper()
-                #if fivemer_entropy(seq) < 4:
-                #    continue
-                q_seqs[r.name] = seq
-
-
-    pool = mp.Pool(args.n_core)
-    K = 14
-    build_look_up(seqs, K)
-    m_pool = mp.Pool(args.d_core)
-
-
-    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
-    for r in pool.imap(get_candidate_aln, lookup_data_iterator(q_seqs, m_pool)):
-        for h in r:
-            print " ".join([str(x) for x in h])
-
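
falcon_qrm.py ranks candidates instead of just thresholding them: chunk-level k-mer hits are aggregated per target read, targets are sorted by total hit count, only the top n_candidates are aligned in detail, and the number of reported matches per query per strand is capped by max_candidates. A minimal sketch of the ranking step (names are hypothetical; the chunk index works as in falcon_overlap.py above):

    import collections

    def rank_targets(hit_positions, chunk_owner, min_chunk_hits=4):
        # aggregate per-target support from chunk-level hit counts, best first
        chunk_hits = collections.Counter(p // 1000 for p in hit_positions)
        per_target = collections.Counter()
        for chunk, n in chunk_hits.items():
            if n > min_chunk_hits:
                per_target[chunk_owner[chunk]] += n
        return [t for t, _ in per_target.most_common()]
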
diff --git a/FALCON/src/py_scripts_v0.1/falcon_qrm_0.py b/FALCON/src/py_scripts_v0.1/falcon_qrm_0.py
deleted file mode 100755
index 2cb6e77..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_qrm_0.py
+++ /dev/null
@@ -1,378 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-import math
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs,t_seqs, seqs
-
-seqs = []
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-all_fivemers = []
-cmap = {0:"A", 1:"T", 2:"C", 3:"G"}
-for i in range(1024):
-    mer = []
-    for j in range(5):
-        mer.append( cmap[ i >> (2 *j) & 3 ])
-    all_fivemers.append("".join(mer))
-
-def fivemer_entropy(seq):
-    five_mer_count = {}
-
-    for i in range(len(seq)-5):
-        five_mer = seq[i:i+5]
-        five_mer_count.setdefault(five_mer, 0)
-        five_mer_count[five_mer] += 1
-
-    entropy = 0.0
-    for five_mer in all_fivemers:
-        p = five_mer_count.get(five_mer, 0) + 1.0
-        p /= len(seq)
-        entropy += - p * math.log(p)
-
-    return entropy
-
-def get_alignment(seq1, seq0):
-
-    K = 8
-    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-    sa_ptr = kup.allocate_seq( len(seq0) )
-    sda_ptr = kup.allocate_seq_addr( len(seq0) )
-    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
-    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
-    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
-    aln_range = aln_range_ptr[0]
-    kup.free_kmer_match(kmer_match_ptr)
-    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score
-    e1 += K + K/2
-    e0 += K + K/2
-    kup.free_aln_range(aln_range)
-    len_1 = len(seq1)
-    len_0 = len(seq0)
-    if e1 > len_1:
-        e1 = len_1
-    if e0 > len_0:
-        e0 = len_0
-
-    aln_size = 1
-    if e1 - s1 > 500:
-
-        #aln_size = max( e1-s1, e0-s0 )
-        #aln_score = int(km_score * 2)
-        #aln_q_s = s1
-        #aln_q_e = e1
-        #aln_t_s = s0
-        #aln_t_e = e0
-
-        alignment = DWA.align(seq1[s1:e1], e1-s1,
-                              seq0[s0:e0], e0-s0,
-                              500, 0)
-        aln_size = alignment[0].aln_str_size
-        aln_score = 4 * alignment[0].aln_str_size - 5 * alignment[0].dist
-        aln_q_s = alignment[0].aln_q_s
-        aln_q_e = alignment[0].aln_q_e
-        aln_t_s = alignment[0].aln_t_s
-        aln_t_e = alignment[0].aln_t_e
-        assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
-        #print aln_str1
-        #print aln_str0
-
-        if aln_size > 500:
-            contain_status = "overlap"
-        DWA.free_alignment(alignment)
-
-    kup.free_seq_addr_array(sda_ptr)
-    kup.free_seq_array(sa_ptr)
-    kup.free_kmer_lookup(lk_ptr)
-
-    if e1 - s1 > 500 and aln_size > 500:
-        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
-    else:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-def get_candidate_aln(hit_input):
-
-    global q_seqs, seqs, t_seqs, q_len
-    q_name, hit_index_f, hit_index_r = hit_input
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-    hit_index = hit_index_f
-    c = collections.Counter(hit_index)
-    s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
-
-    hit_data = []
-    hit_ids = set()
-    for p, hit_count in s:
-        hit_id = seqs[p][0]
-        if hit_id == q_name or hit_id in hit_ids:
-            continue
-        if hit_id not in hit_ids:
-            hit_ids.add(hit_id)
-            hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
-
-    hit_data.sort( key=lambda x:-x[2] )
-
-    target_count = {}
-    total_hit = 0
-
-    for hit in hit_data:
-        hit_id = hit[0]
-        hit_count = hit[3]
-        target_count.setdefault(hit_id, 0)
-        if target_count[hit_id] > 64:
-            continue
-        if total_hit > 64:
-            continue
-        seq1, seq0 = q_seq, hit[1]
-        aln_data = get_alignment(seq1, seq0)
-        if rtn != None:
-
-            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
-            if c_status == "none":
-                continue
-            """
-            if e1 - s1 < 5000:
-                if -aln_score > -8000:
-                    continue
-                if (100.0*aln_score/(aln_size+1)) < 150:
-                    continue
-            """
-            target_count[hit_id] += 1
-            total_hit += 1
-            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
-                          0, s1, e1, len(seq1),
-                          0, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    hit_index = hit_index_r
-    c = collections.Counter(hit_index)
-    s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
-
-    hit_data = []
-    hit_ids = set()
-    for p, hit_count in s:
-        hit_id = seqs[p][0]
-        if hit_id == q_name or hit_id in hit_ids:
-            continue
-        if hit_id not in hit_ids:
-            hit_ids.add(hit_id)
-            hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
-
-    hit_data.sort( key=lambda x:-x[2] )
-
-    target_count = {}
-    total_hit = 0
-
-    for hit in hit_data:
-        hit_id = hit[0]
-        hit_count = hit[3]
-        target_count.setdefault(hit_id, 0)
-        if target_count[hit_id] > 64:
-            continue
-        if total_hit > 64:
-            continue
-        seq1, seq0 = r_q_seq, hit[1]
-        aln_data = get_alignment(seq1, seq0)
-        if aln_data is not None:
-            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
-            if c_status == "none":
-                continue
-            """
-            if e1 - s1 < 5000:
-                if -aln_score > -8000:
-                    continue
-                if (100.0*aln_score/(aln_size+1)) < 150:
-                    continue
-            """
-            target_count[hit_id] += 1
-            total_hit += 1
-            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
-                          0, len(seq1) - e1, len(seq1) - s1, len(seq1),
-                          1, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
-    return rtn
-
-def build_look_up(seqs, K):
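-    # Build the k-mer lookup tables in shared memory (RawArray) so the worker
-    # processes in the multiprocessing pool can query them directly. Each entry
-    # in `seqs` is a (read_name, 1000 bp segment) pair; segments are indexed at
-    # consecutive 1 kb offsets, and highly repetitive k-mers are masked
-    # (threshold 1024).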
-    global sa_ptr, sda_ptr, lk_ptr
-
-    total_index_base = len(seqs) * 1000
-    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    kup.init_seq_array(c_sa_ptr, total_index_base)
-
-    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
-    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
-    start = 0
-    for r_name, seq in seqs:
-        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
-        start += 1000
-
-    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 1024)
-
-    #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
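-    # Map the query read (and its reverse complement) onto the shared k-mer
-    # lookup built by build_look_up; target positions are divided by 1000 to
-    # get the index of the 1 kb segment (and hence the target read) they hit.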
-
-    global sa_ptr, sda_ptr, lk_ptr
-    global q_seqs
-
-    K = 14
-    q_seq = q_seqs[q_name]
-
-    rtn = []
-
-    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
-    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-
-    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    count = kmer_match.count
-    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
-    kup.free_kmer_match(kmer_match_ptr)
-    return  q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
-    for q_name, q_seq in q_seqs.items():
-        yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
-    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
-        yield mr
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
-    parser.add_argument('target_fofn', help='a fofn of fasta files containing the target sequences for overlapping')
-    parser.add_argument('query_fofn', help='a fofn of fasta files containing the query sequences to be overlapped with the targets')
-    parser.add_argument('--min_len', type=int, default=4000,
-                        help='minimum length of the reads to be considered for overlapping')
-    parser.add_argument('--n_core', type=int, default=1,
-                        help='number of processes used for detailed overlapping evaluation')
-    parser.add_argument('--d_core', type=int, default=1,
-                        help='number of processes used for k-mer matching')
-
-
-    args = parser.parse_args()
-
-    q_seqs = {}
-    t_seqs = {}
-    if args.min_len < 1200:
-        args.min_len = 1200
-
-    with open(args.target_fofn) as fofn:
-        for fn in fofn:
-            fn = fn.strip()
-            f = FastaReader(fn) # open each fasta file listed in the target fofn
-            for r in f:
-                if len(r.sequence) < args.min_len:
-                    continue
-                seq = r.sequence.upper()
-                for start in range(0, len(seq), 1000):
-                    if start+1000 > len(seq):
-                        break
-                    subseq = seq[start: start+1000]
-                    #if fivemer_entropy(subseq) < 4:
-                    #    continue
-                    seqs.append( (r.name, subseq) )
-                subseq = seq[-1000:]
-                #if fivemer_entropy(subseq) < 4:
-                #    continue
-                #seqs.append( (r.name, seq[:1000]) )
-                seqs.append( (r.name, subseq) )
-
-                t_seqs[r.name] = seq
-
-    with open(args.query_fofn) as fofn:
-        for fn in fofn:
-            fn = fn.strip()
-            f = FastaReader(fn) # open each fasta file listed in the query fofn
-            for r in f:
-                #if len(r.sequence) < args.min_len:
-                #    continue
-                seq = r.sequence.upper()
-                if fivemer_entropy(seq) < 4:
-                    continue
-                q_seqs[r.name] = seq
-
-
-    pool = mp.Pool(args.n_core)
-    K = 14
-    build_look_up(seqs, K)
-    m_pool = mp.Pool(args.d_core)
-
-
-    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
-    for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
-        for h in r:
-            print " ".join([str(x) for x in h])
-
diff --git a/FALCON/src/py_scripts_v0.1/falcon_sense.py b/FALCON/src/py_scripts_v0.1/falcon_sense.py
deleted file mode 100644
index 26f1954..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_sense.py
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from ctypes import *
-import sys
-from multiprocessing import Pool
-import os
-import falcon_kit
-
-module_path = falcon_kit.__path__[0]
-
-falcon = CDLL(os.path.join(module_path, "falcon.so"))
-
-falcon.generate_consensus.argtypes = [ POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double ]
-falcon.generate_consensus.restype = POINTER(falcon_kit.ConsensusData)
-falcon.free_consensus_data.argtypes = [ POINTER(falcon_kit.ConsensusData) ]
-
-
-def get_alignment(seq1, seq0, edge_tolerance = 1000):
-
-    kup = falcon_kit.kup
-    K = 8
-    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-    sa_ptr = kup.allocate_seq( len(seq0) )
-    sda_ptr = kup.allocate_seq_addr( len(seq0) )
-    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
-    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
-    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
-    aln_range = aln_range_ptr[0]
-    kup.free_kmer_match(kmer_match_ptr)
-    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score
-    e1 += K + K/2
-    e0 += K + K/2
-    kup.free_aln_range(aln_range)
-    len_1 = len(seq1)
-    len_0 = len(seq0)
-    if e1 > len_1:
-        e1 = len_1
-    if e0 > len_0:
-        e0 = len_0
-
-    aln_size = 1
-    if e1 - s1 > 500:
-
-        aln_size = max( e1-s1, e0-s0 )
-        aln_score = int(km_score * 48)
-        aln_q_s = s1
-        aln_q_e = e1
-        aln_t_s = s0
-        aln_t_e = e0
-
-    kup.free_seq_addr_array(sda_ptr)
-    kup.free_seq_array(sa_ptr)
-    kup.free_kmer_lookup(lk_ptr)
-
-    if s1 > edge_tolerance and s0 > edge_tolerance:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-    if len_1 - e1 > edge_tolerance and len_0 - e0 > edge_tolerance:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-
-    if e1 - s1 > 500 and aln_size > 500:
-        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
-    else:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-def get_consensus_without_trim( c_input ):
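-    # Pass the reads straight to the C consensus routine in falcon.so, capped at
-    # max_n_read, without any pre-trimming of the supporting reads.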
-    seqs, seed_id, config = c_input
-    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
-    if len(seqs) > max_n_read:
-        seqs = seqs[:max_n_read]
-    seqs_ptr = (c_char_p * len(seqs))()
-    seqs_ptr[:] = seqs
-    consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(seqs), min_cov, K,
-                                                    local_match_count_window, local_match_count_threshold, min_idt )
-
-    consensus = string_at(consensus_data_ptr[0].sequence)[:]
-    eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
-    falcon.free_consensus_data( consensus_data_ptr )
-    del seqs_ptr
-    return consensus, seed_id
-
-def get_consensus_with_trim( c_input ):
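-    # Align every supporting read to the seed first (sparse k-mer alignment) and
-    # keep only the aligned span, trimmed by trim_size on both ends, before
-    # calling the C consensus routine.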
-    seqs, seed_id, config = c_input
-    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
-    trim_seqs = []
-    seed = seqs[0]
-    for seq in seqs[1:]:
-        aln_data = get_alignment(seq, seed, edge_tolerance)
-        s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
-        if c_status == "none":
-            continue
-        if aln_score > 1000 and e1 - s1 > 500:
-            e1 -= trim_size
-            s1 += trim_size
-            trim_seqs.append( (e1-s1, seq[s1:e1]) )
-    trim_seqs.sort(key = lambda x:-x[0]) #use longest alignment first
-    trim_seqs = [x[1] for x in trim_seqs]
-
-    if len(trim_seqs) > max_n_read:
-        trim_seqs = trim_seqs[:max_n_read]
-
-    trim_seqs = [seed] + trim_seqs
-
-
-    seqs_ptr = (c_char_p * len(trim_seqs))()
-    seqs_ptr[:] = trim_seqs
-    consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(trim_seqs), min_cov, K,
-                                               local_match_count_window, local_match_count_threshold, min_idt )
-    consensus = string_at(consensus_data_ptr[0].sequence)[:]
-    eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
-    falcon.free_consensus_data( consensus_data_ptr )
-    del seqs_ptr
-    return consensus, seed_id
-
-
-def get_seq_data(config):
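-    # Read seed/support groups from stdin. Each line is "<read_id> <sequence>";
-    # the first read of a group is the seed, a line starting with "+" ends the
-    # group, and a line starting with "-" ends the input.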
-    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
-    seqs = []
-    seed_id = None
-    seqs_data = []
-    read_ids = set()
-    with sys.stdin as f:
-        for l in f:
-            l = l.strip().split()
-            if len(l) != 2:
-                continue
-            if l[0] not in ("+", "-"):
-                if len(l[1]) > 100:
-                    if len(seqs) == 0:
-                        seqs.append(l[1]) #the "seed"
-                        seed_id = l[0]
-                    if l[0] not in read_ids: #avoiding using the same read twice
-                        seqs.append(l[1])
-            elif l[0] == "+":
-                if len(seqs) > 10:
-                    seqs.sort( key=lambda x: -len(x) )
-                    yield (seqs[:max_n_read], seed_id, config)
-                #seqs_data.append( (seqs, seed_id) )
-                seqs = []
-                read_ids = set()
-                seed_id = None
-            elif l[0] == "-":
-                #yield (seqs, seed_id)
-                #seqs_data.append( (seqs, seed_id) )
-                break
-
-if __name__ == "__main__":
-    import argparse
-    import re
-    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
-    parser.add_argument('--n_core', type=int, default=24,
-                        help='number of processes used for generating consensus')
-    parser.add_argument('--local_match_count_window', type=int, default=12,
-                        help='local match window size')
-    parser.add_argument('--local_match_count_threshold', type=int, default=6,
-                        help='local match count threshold')
-    parser.add_argument('--min_cov', type=int, default=6,
-                        help='minimum coverage to break the consensus')
-    parser.add_argument('--max_n_read', type=int, default=500,
-                        help='maximum number of reads used in generating the consensus')
-    parser.add_argument('--trim', action="store_true", default=False,
-                        help='trim the input sequences with k-mer sparse dynamic programming to find the mapped range')
-    parser.add_argument('--output_full', action="store_true", default=False,
-                        help='output uncorrected regions too')
-    parser.add_argument('--output_multi', action="store_true", default=False,
-                        help='output multiple corrected regions')
-    parser.add_argument('--min_idt', type=float, default=0.70,
-                        help='minimum identity of the alignments used for correction')
-    parser.add_argument('--edge_tolerance', type=int, default=1000,
-                        help='for trimming, if the unaligned edge length is greater than edge_tolerance, ignore the read')
-    parser.add_argument('--trim_size', type=int, default=50,
-                        help='the size for trimming both ends from the initial sparse aligned region')
-    good_region = re.compile("[ACGT]+")
-    args = parser.parse_args()
-    exe_pool = Pool(args.n_core)
-    if args.trim:
-        get_consensus = get_consensus_with_trim
-    else:
-        get_consensus = get_consensus_without_trim
-
-    K = 8
-    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
-             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
-    for res in exe_pool.imap(get_consensus, get_seq_data(config)):
-        cns, seed_id = res
-        if args.output_full == True:
-            if len(cns) > 500:
-                print ">"+seed_id+"_f"
-                print cns
-        else:
-            cns = good_region.findall(cns)
-            if len(cns) == 0:
-                continue
-            if args.output_multi == True:
-                seq_i = 0
-                for cns_seq in cns:
-                    if len(cns_seq) > 500:
-                        print ">"+seed_id+"_%d" % seq_i
-                        print cns_seq
-                    seq_i += 1
-            else:
-                cns.sort(key = lambda x: len(x))
-                if len(cns[-1]) > 500:
-                    print ">"+seed_id
-                    print cns[-1]
-
diff --git a/FALCON/src/py_scripts_v0.1/falcon_ucns_data.py b/FALCON/src/py_scripts_v0.1/falcon_ucns_data.py
deleted file mode 100644
index 7d206fd..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_ucns_data.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import sys
-import os
-
-
-rcmap = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
-
-if __name__ == "__main__":
-    import argparse
-    import re
-    from pbcore.io import FastaReader
-
-    tiling_path = {}
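-    # all_tiling_path_c columns: contig_id, offset, node_id ("read:end"), start, end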
-    with open("all_tiling_path_c") as f:
-        for l in f:
-            l = l.strip().split()
-            tiling_path.setdefault( l[0], [])
-
-            offset = int(l[1])
-            node_id = l[2].split(":")
-            s = int(l[3])
-            e = int(l[4])
-
-            tiling_path[ l[0] ].append( (offset, node_id[0], node_id[1], s, e) )
-
-    f = FastaReader("preads.fa")
-    seq_db = {}
-    for r in f:
-         seq_db[r.name] = r.sequence
-
-    f = FastaReader("primary_tigs_c.fa")
-    p_tigs_db = {}
-    for r in f:
-         p_tigs_db[r.name] = r.sequence
-
-    for p_tig_id in p_tigs_db:
-        pread_data = {}
-        offsets = []
-        seqs = []
-        p_tig = p_tigs_db[p_tig_id]
-        #if len(tiling_path[p_tig_id]) <= 2:
-        #    continue
-        print p_tig_id, 0, p_tig
-        for offset, s_id, end, s, e in tiling_path[p_tig_id]:
-            seq = seq_db[s_id]
-            if end == "B":
-                s, e = e, s
-                offset = offset - len(seq)
-                seq = "".join([rcmap[c] for c in seq[::-1]])
-            else:
-                offset = offset - len(seq)
-            print s_id, offset, seq
-
-        print "+ + +"
-
-    f = FastaReader("a_nodup.fa")
-    a_tigs_db = {}
-    for r in f:
-         a_tigs_db[r.name] = r.sequence
-
-    for a_tig_id in a_tigs_db:
-        pread_data = {}
-        offsets = []
-        seqs = []
-        a_tig = a_tigs_db[a_tig_id]
-        #if len(tiling_path[a_tig_id]) <= 2:
-        #    continue
-        print a_tig_id, 0, a_tig
-        for offset, s_id, end, s, e in tiling_path[a_tig_id]:
-            seq = seq_db[s_id]
-            if end == "B":
-                s, e = e, s
-                offset = offset - len(seq)
-                seq = "".join([rcmap[c] for c in seq[::-1]])
-            else:
-                offset = offset - len(seq)
-            print s_id, offset, seq
-
-        print "+ + +"
-
-    print "- - -"
-
diff --git a/FALCON/src/py_scripts_v0.1/falcon_utgcns.py b/FALCON/src/py_scripts_v0.1/falcon_utgcns.py
deleted file mode 100644
index bd2cc1b..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_utgcns.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from ctypes import *
-import sys
-from multiprocessing import Pool
-import os
-import falcon_kit
-
-module_path = falcon_kit.__path__[0]
-
-falcon = CDLL(os.path.join(module_path, "falcon.so"))
-"""
-consensus_data * generate_utg_consensus( char ** input_seq,
-                           seq_coor_t *offset,
-                           unsigned int n_seq,
-                           unsigned min_cov,
-                           unsigned K,
-                           double min_idt) {
-"""
-falcon.generate_utg_consensus.argtypes = [ POINTER(c_char_p), POINTER(falcon_kit.seq_coor_t), c_uint, c_uint, c_uint, c_double ]
-falcon.generate_utg_consensus.restype = POINTER(falcon_kit.ConsensusData)
-falcon.free_consensus_data.argtypes = [ POINTER(falcon_kit.ConsensusData) ]
-
-rcmap = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
-
-def get_consensus(c_input):
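-    # Feed the tiling-path reads and their offsets along the unitig to the C
-    # consensus routine (generate_utg_consensus), with min_cov=0 and min_idt=0.0.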
-    t_id, seqs, offsets, config = c_input
-    K = config[0]
-    seqs_ptr = (c_char_p * len(seqs))()
-    seqs_ptr[:] = seqs
-    offset_ptr = (c_long * len(seqs))( *offsets )
-    consensus_data_ptr = falcon.generate_utg_consensus( seqs_ptr, offset_ptr, len(seqs), 0, K, 0.)
-    consensus = string_at(consensus_data_ptr[0].sequence)[:]
-    del seqs_ptr
-    del offset_ptr
-    falcon.free_consensus_data( consensus_data_ptr )
-    return consensus, t_id
-
-def echo(c_input):
-
-    t_id, seqs, offsets, config = c_input
-
-    return len(seqs), "test"
-
-def get_seq_data(config):
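-    # Read unitig records from stdin. Each line is "<read_id> <offset> <sequence>";
-    # the first read is the seed, "+" ends one unitig record, "-" ends the input.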
-    seqs = []
-    offsets = []
-    seed_id = None
-    with sys.stdin as f:
-        for l in f:
-            l = l.strip().split()
-            if len(l) != 3:
-                continue
-            if l[0] not in ("+", "-"):
-                if len(seqs) == 0:
-                    seqs.append(l[2]) #the "seed"
-                    offsets.append( int(l[1]) )
-                    seed_id = l[0]
-                else:
-                    seqs.append(l[2])
-                    offsets.append( int(l[1]) )
-            elif l[0] == "+":
-                yield (seed_id, seqs, offsets, config)
-                seqs = []
-                offsets = []
-                seed_id = None
-            elif l[0] == "-":
-                break
-
-if __name__ == "__main__":
-    import argparse
-    import re
-    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
-    parser.add_argument('--n_core', type=int, default=4,
-                        help='number of processes used for generating consensus')
-    args = parser.parse_args()
-    exe_pool = Pool(args.n_core)
-    K = 8
-    config = (K, )
-    for res in exe_pool.imap(get_consensus, get_seq_data(config)):
-    #for res in exe_pool.imap(echo, get_seq_data(config)):
-    #for res in map(echo, get_seq_data(config)):
-    #for res in map(get_consensus, get_seq_data(config)):
-        cns, t_id = res
-        print ">"+t_id+"|tigcns"
-        print cns
-
diff --git a/FALCON/src/py_scripts_v0.1/get_ovl.sh b/FALCON/src/py_scripts_v0.1/get_ovl.sh
deleted file mode 100644
index 417f03b..0000000
--- a/FALCON/src/py_scripts_v0.1/get_ovl.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step1.py > {}.ignore" ::: *.las
-rm all.ignore
-cat *.ignore > all.ignore
-/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step2.py > {}.rc" ::: *.las
-cat *.rc > rc_out_all
-rm *.rc
-/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step3.py > {}.ovl" ::: *.las
diff --git a/FALCON/src/py_scripts_v0.1/get_rdata.py b/FALCON/src/py_scripts_v0.1/get_rdata.py
deleted file mode 100755
index f4fbf99..0000000
--- a/FALCON/src/py_scripts_v0.1/get_rdata.py
+++ /dev/null
@@ -1,207 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import sys
-import glob
-#import pkg_resources
-import uuid
-from datetime import datetime
-
-from collections import Counter
-from multiprocessing import Pool
-#from pbtools.pbdagcon.q_sense import *
-import os
-
-"""
-try:
-    __p4revision__ = "$Revision: #4 $"
-    __p4change__ = "$Change: 121571 $"
-    revNum = int(__p4revision__.strip("$").split(" ")[1].strip("#"))
-    changeNum = int(__p4change__.strip("$").split(":")[-1])
-    __version__ = "%s-r%d-c%d" % ( pkg_resources.require("pbtools.pbhgap")[0].version, revNum, changeNum )
-except:
-    __version__ = "pbtools.hbar-dtk-github"
-"""
-
-query_fasta_fn = sys.argv[1]
-target_fasta_fn = sys.argv[2]
-m4_fofn = sys.argv[3]
-bestn = int(sys.argv[4])
-group_id = int(sys.argv[5])
-num_chunk = int(sys.argv[6])
-min_cov = int(sys.argv[7])
-max_cov = int(sys.argv[8])
-trim_align = int(sys.argv[9])
-trim_plr = int(sys.argv[10])
-
-
-rmap = dict(zip("ACGTNacgt-","TGCANntgca-"))
-def rc(seq):
-    return "".join([rmap[c] for c in seq[::-1]])
-
-"""0x239fb832/0_590 0x722a1e26 -1843 81.6327 0 62 590 590 0 6417 6974 9822 254 11407 -74.5375 -67.9 1"""
-query_to_target = {}
-with open(m4_fofn) as fofn:
-    for fn in fofn:
-        fn = fn.strip()
-        with open(fn) as m4_f:
-            for l in m4_f:
-                d = l.strip().split()
-                id1, id2 = d[:2]
-                #if -noSplitSubread is not used, we will need the following line
-                #id1 = id1.split("/")[0]
-                if id1 == id2:
-                    continue
-                if hash(id2) % num_chunk != group_id:
-                    continue
-                if int(d[2]) > -1000: continue
-                if int(d[11]) < 4000: continue
-                query_to_target.setdefault(id1, [])
-                query_to_target[id1].append( (int(d[2]), l) )
-
-target_to_query = {}
-for id1 in query_to_target:
-    query_to_target[id1].sort()
-    rank = 0
-    for s, ll in query_to_target[id1][:bestn]:
-        l = ll.strip()
-        d = l.split()
-        id1, id2 = d[:2]
-        target_to_query.setdefault(id2,[])
-        target_to_query[id2].append( ( (int(d[5])-int(d[6]), int(d[2])), l ) )
-        #target_to_query[id2].append( ( int(d[2]), l ) )
-        #rank += 1
-
-from pbcore.io import FastaIO
-query_data = {}
-with open(query_fasta_fn) as fofn:
-    for fa_fn in fofn:
-        fa_fn = fa_fn.strip()
-        f_s = FastaIO.FastaReader(fa_fn)
-        for s in f_s:
-            id1 = s.name
-            if id1 not in query_to_target:
-                continue
-            query_data[id1]=s.sequence
-        f_s.file.close()
-
-target_data = {}
-with open(target_fasta_fn) as fofn:
-    for fa_fn in fofn:
-        fa_fn = fa_fn.strip()
-        f_s = FastaIO.FastaReader(fa_fn)
-        for s in f_s:
-            id2 = s.name
-            if hash(id2) % num_chunk != group_id:
-                continue
-            target_data[id2]=s.sequence
-        f_s.file.close()
-
-
-ec_data = []
-base_count = Counter()
-r_count =0
-
-for id2 in target_to_query:
-    if len(target_to_query[id2])<10:
-        continue
-    if id2 not in target_data:
-        continue
-
-    ref_data = (id2, target_data[id2])
-    ref_len = len(target_data[id2])
-    base_count.clear()
-    base_count.update( target_data[id2] )
-    if 1.0*base_count.most_common(1)[0][1]/ref_len > 0.8:  # don't do preassembly if more than 80% of a read is the same base
-        continue
-    read_data = []
-
-    query_alignment = target_to_query[id2]
-    query_alignment.sort() # sort so the longer / better-scoring alignments come first
-    total_bases = 0
-    max_cov_bases = max_cov * ref_len * 1.2
-    #min_cov_bases = min_cov * ref_len * 3
-
-    for rank_score, l in query_alignment:
-        rank, score = rank_score
-        #score = rank_score
-        l = l.split()
-        id1 = l[0]
-        #if -noSplitSubread is not used, we will need the following line
-        #id1 = id1.split("/")[0]
-        q_s = int(l[5]) + trim_align
-        q_e = int(l[6]) - trim_align
-        strand = int(l[8])
-        t_s = int(l[9])
-        t_e = int(l[10])
-        t_l = int(l[11])
-        #if strand == 1:
-        #    t_s, t_e = t_l - t_e, t_l - t_s
-        #    t_s += trim_align
-        #    t_e -= trim_align
-
-        if q_e - q_s < 400:
-            continue
-        total_bases += q_e - q_s
-        if total_bases > max_cov_bases:
-            break
-        q_seq = query_data[id1][q_s:q_e]
-        read_data.append( ( "%s/0/%d_%d" % (id1, q_s, q_e), q_s, q_e, q_seq, strand, t_s, t_e) )
-
-    if len(read_data) > 5:
-        r_count += 1
-        t_id, t_seq = ref_data
-        t_len = len(t_seq)
-        print t_id, t_seq
-        for r in read_data:
-            q_id, q_s, q_e, q_seq, strand, t_s, t_e = r
-            if strand == 1:
-                q_seq = rc(q_seq)
-            print q_id, q_seq
-        #if r_count > 600:
-        #    break
-        print "+ +"
-print "- -"
-
-#output_dir,dumb = os.path.split( os.path.abspath( output_file ) )
-#output_log = open ( os.path.join( output_dir, "j%02d.log" % group_id ), "w" )
-
-
-
-
diff --git a/FALCON/src/py_scripts_v0.1/overlapper.py b/FALCON/src/py_scripts_v0.1/overlapper.py
deleted file mode 100644
index 3a040d2..0000000
--- a/FALCON/src/py_scripts_v0.1/overlapper.py
+++ /dev/null
@@ -1,216 +0,0 @@
-from falcon_kit import kup, falcon, DWA, get_consensus, get_alignment
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-
-seqs = []
-q_seqs = {}
-f = FastaReader(sys.argv[1]) # take one command line argument: the input fasta file name
-
-for r in f:
-    if len(r.sequence) < 6000:
-        continue
-    seq = r.sequence.upper()
-    seqs.append( (r.name, seq[:500], seq[-500:] ) )
-    q_seqs[r.name] = seq
-
-
-total_index_base = len(seqs) * 1000
-print total_index_base
-sa_ptr = kup.allocate_seq( total_index_base )
-sda_ptr = kup.allocate_seq_addr( total_index_base )
-K=14
-lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-
-start = 0
-for r_name, prefix, suffix in seqs:
-    kup.add_sequence( start, K, prefix, 500, sda_ptr, sa_ptr, lk_ptr)
-    start += 500
-    kup.add_sequence( start, K, suffix, 500, sda_ptr, sa_ptr, lk_ptr)
-    start += 500
-#kup.mask_k_mer(1 << (K * 2), lk_ptr, 256)
-
-kup.mask_k_mer(1 << (K * 2), lk_ptr, 64)
-
-def get_alignment(seq1, seq0):
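-    # Seed-and-extend alignment of seq1 against seq0: build a temporary 8-mer
-    # lookup for seq0, find the best k-mer match range, then run the DWA aligner
-    # over that range. Returns coordinates, size and distance, or None if no
-    # sufficiently long match is found.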
-
-    K = 8
-    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-    sa_ptr = kup.allocate_seq( len(seq0) )
-    sda_ptr = kup.allocate_seq_addr( len(seq0) )
-    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
-    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
-    kup.free_kmer_match(kmer_match_ptr)
-    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
-    if e1 - s1 > 500:
-        #s1 = 0 if s1 < 14 else s1 - 14
-        #s2 = 0 if s2 < 14 else s2 - 14
-        e1 = len(seq1) if e1 >= len(seq1)-2*K else e1 + K*2
-        e2 = len(seq0) if e2 >= len(seq0)-2*K else e2 + K*2
-
-        alignment = DWA.align(seq1[s1:e1], e1-s1,
-                              seq0[s2:e2], e2-s2,
-                              100, 0)
-        #print seq1[s1:e1]
-        #print seq0[s2:e2]
-        #if alignment[0].aln_str_size > 500:
-
-        #aln_str1 = alignment[0].q_aln_str
-        #aln_str0 = alignment[0].t_aln_str
-        aln_size = alignment[0].aln_str_size
-        aln_dist = alignment[0].dist
-        aln_q_s = alignment[0].aln_q_s
-        aln_q_e = alignment[0].aln_q_e
-        aln_t_s = alignment[0].aln_t_s
-        aln_t_e = alignment[0].aln_t_e
-        assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
-        #print aln_str1
-        #print aln_str0
-
-        DWA.free_alignment(alignment)
-
-    kup.free_seq_addr_array(sda_ptr)
-    kup.free_seq_array(sa_ptr)
-    kup.free_kmer_lookup(lk_ptr)
-    if e1 - s1 > 500 and aln_size > 500:
-        return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist
-    else:
-        return None
-
-
-def get_overlap_alignment(seq1, seq0):
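-    # Like get_alignment above, but also classify the pair: "contains"/"contained"
-    # when the sparse match spans nearly all of one of the two reads, "overlap"
-    # when a dovetail alignment longer than 500 bp is confirmed by DWA, otherwise
-    # "none".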
-
-    K = 8
-    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-    sa_ptr = kup.allocate_seq( len(seq0) )
-    sda_ptr = kup.allocate_seq_addr( len(seq0) )
-    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
-    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
-    kmer_match = kmer_match_ptr[0]
-    aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
-    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
-    kup.free_kmer_match(kmer_match_ptr)
-    s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
-    len_1 = len(seq1)
-    len_0 = len(seq0)
-    do_aln = False
-    contain_status = "none"
-    if e1 - s1 > 500:
-        if s1 < 100 and len_1 - e1 < 100:
-            do_aln = False
-            contain_status = "contains"
-        elif s0 < 100 and len_0 - e0 < 100:
-            do_aln = False
-            contain_status = "contained"
-        else:
-            do_aln = True
-            if s0 < s1:
-                s1 -= s0 #assert s1 > 0
-                s0 = 0
-                e1 = len_1
-                e0 = len_1 - s1 if len_1 - s1 < len_0 else len_0
-                if e0 == len_0:
-                    do_aln = False
-                    contain_status = "contained"
-
-            if s1 <= s0:
-                s0 -= s1 #assert s1 > 0
-                s1 = 0
-                e0 = len_0
-                e1 = len_0 - s0 if len_0 - s0 < len_1 else len_1
-                if e1 == len_1:
-                    do_aln = False
-                    contain_status = "contains"
-
-
-        if do_aln:
-            alignment = DWA.align(seq1[s1:e1], e1-s1,
-                                  seq0[s0:e0], e0-s0,
-                                  500, 0)
-            #print seq1[s1:e1]
-            #print seq0[s2:e2]
-            #if alignment[0].aln_str_size > 500:
-
-            #aln_str1 = alignment[0].q_aln_str
-            #aln_str0 = alignment[0].t_aln_str
-            aln_size = alignment[0].aln_str_size
-            aln_dist = alignment[0].dist
-            aln_q_s = alignment[0].aln_q_s
-            aln_q_e = alignment[0].aln_q_e
-            aln_t_s = alignment[0].aln_t_s
-            aln_t_e = alignment[0].aln_t_e
-            assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
-            #print aln_str1
-            #print aln_str0
-            if aln_size > 500:
-                contain_status = "overlap"
-            DWA.free_alignment(alignment)
-
-    kup.free_seq_addr_array(sda_ptr)
-    kup.free_seq_array(sa_ptr)
-    kup.free_kmer_lookup(lk_ptr)
-
-    if e1 - s1 > 500 and do_aln and aln_size > 500:
-        #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
-        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
-    else:
-        return 0, 0, 0, 0, 0, 0, contain_status
-
-rc_map = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-with open("test_ovlp.dat","w") as f:
-    for name, q_seq in q_seqs.items():
-        kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
-        kmer_match = kmer_match_ptr[0]
-        count = kmer_match.count
-        hit_index = np.array(kmer_match.target_pos[0:count])/500
-        kup.free_kmer_match(kmer_match_ptr)
-
-        c = collections.Counter(hit_index)
-        s = [c[0] for c in c.items() if c[1] >50]
-        #s.sort()
-        targets = set()
-        for p in s:
-            hit_id = seqs[p/2][0]
-            if hit_id in targets or hit_id == name:
-                continue
-            targets.add(hit_id)
-            seq1, seq0 = q_seq, q_seqs[hit_id ]
-            rtn = get_overlap_alignment(seq1, seq0)
-            #rtn = get_alignment(seq1, seq0)
-            if rtn != None:
-
-                s1, e1, s2, e2, aln_size, aln_dist, c_status = rtn
-                #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
-                print >>f, hit_id, name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 0, s2, e2, len(seq0), 0, s1, e1, len(seq1), c_status
-
-        r_q_seq = "".join([rc_map[c] for c in q_seq[::-1]])
-
-        kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, sda_ptr, lk_ptr)
-        kmer_match = kmer_match_ptr[0]
-        count = kmer_match.count
-        hit_index = np.array(kmer_match.target_pos[0:count])/500
-        kup.free_kmer_match(kmer_match_ptr)
-
-        c = collections.Counter(hit_index)
-        s = [c[0] for c in c.items() if c[1] >50]
-        #s.sort()
-        targets = set()
-        for p in s:
-            hit_id = seqs[p/2][0]
-            if hit_id in targets or hit_id == name:
-                continue
-            targets.add(hit_id)
-            seq1, seq0 = r_q_seq, q_seqs[hit_id]
-            rtn = get_overlap_alignment(seq1, seq0)
-            #rtn = get_alignment(seq1, seq0)
-            if rtn != None:
-                s1, e1, s2, e2, aln_size, aln_dist, c_status = rtn
-                #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
-                print >>f, hit_id, name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 0, s2, e2, len(seq0), 1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status
-
diff --git a/FALCON/src/py_scripts_v0.1/ovlp_filter.sh b/FALCON/src/py_scripts_v0.1/ovlp_filter.sh
deleted file mode 100644
index 608389e..0000000
--- a/FALCON/src/py_scripts_v0.1/ovlp_filter.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-source /mnt/secondary/Share/HBAR_03202013/bin/activate
-parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step1.py > {}.ignore" ::: *.las
-cat *.ignore > all.ignore
-parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step2.py > {}.rc" ::: *.las
-cat *.rc > rc_out_all
-parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step3.py > {}.ovl" ::: *.las
diff --git a/FALCON/src/py_scripts_v0.1/redis_graph.py b/FALCON/src/py_scripts_v0.1/redis_graph.py
deleted file mode 100644
index 555c090..0000000
--- a/FALCON/src/py_scripts_v0.1/redis_graph.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import redis
-import sys
-from pbcore.io import FastaReader
-
-
-r = redis.StrictRedis(host='localhost', port=6379, db=0)
-
-class RedisList(object):
-
-    def __init__(self, rs):
-        self._rs = rs
-        self.id_ = "pid:" + str( id(self) )
-
-    def append(self, value):
-        self._rs.rpush( self.id_, value)
-
-    def __len__(self):
-        return self._rs.llen( self.id_ )
-
-    def __getitem__(self, i):
-        return self._rs.lrange( self.id_, i, i)
-
-    def pylist(self):
-        return self._rs.lrange( self.id_, 0, -1)
-
-    def __del__(self):
-        self._rs.delete(self.id_)
-
-class RedisDict(object):
-
-    def __init__(self, rs):
-        self._rs = rs
-        self.id_ = "pid:" + str( id(self) )
-
-    def __setitem__(self, key, value):
-        self._rs.hset( self.id_, key, value )
-
-    def __getitem__(self, key):
-        return self._rs.hget( self.id_, key )
-
-    def __delitem__(self, key):
-        return self._rs.hdel( self.id_, key)
-
-
-    def __len__(self):
-        return self._rs.hlen( self.id_ )
-
-    def keys(self):
-        return self._rs.hgetall( self.id_ ).keys()
-
-    def values(self):
-        return self._rs.hgetall( self.id_ ).values()
-
-    def pydict(self):
-        return self._rs.hgetall( self.id_ )
-
-    def __del__(self):
-        self._rs.delete(self.id_)
-
-def test_list():
-    x = RedisList(r)
-    x.append( "1" )
-    x.append( "2" )
-    print len(x)
-    print x.pylist()
-    del x
-
-    y = RedisDict(r)
-    y["a"] = "b"
-    y["b"] = 1
-    print y["a"]
-    del y["a"]
-    print y.values()
-    print y.keys()
-    print y.pydict()
-    del y
-
-if __name__ == "__main__":
-    test_list()
diff --git a/FALCON/src/py_scripts_v0.1/remove_dup_ctg.py b/FALCON/src/py_scripts_v0.1/remove_dup_ctg.py
deleted file mode 100755
index 3164eb6..0000000
--- a/FALCON/src/py_scripts_v0.1/remove_dup_ctg.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import pbcore.io
-
-import sys
-"""nucmer -maxmatch all_tigs.fa all_tigs.fa -p all_tigs_self >& /dev/null"""
-"""show-coords -o -H -T all_tigs_self.delta | grep CONTAINS | awk '$7>96' | awk '{print $9}' | sort -u > all_tigs_duplicated_ids"""
-
-id_to_remove = set()
-with open("all_tigs_duplicated_ids") as f:
-    for l in f:
-        l = l.strip().split("-")
-        major, minor = l[:2]
-        id_to_remove.add ( (major, minor) )
-
-f = pbcore.io.FastaReader("all_tigs.fa")
-with open("a-tigs_nodup.fa", "w") as f_out:
-    for r in f:
-        major, minor = r.name.split()[0].split("-")[:2]
-        if minor == "0000":
-            continue
-        if (major, minor) in id_to_remove:
-            continue
-        if len(r.sequence) < 500:
-            continue
-        print >>f_out, ">"+r.name
-        print >>f_out, r.sequence
-
-f = pbcore.io.FastaReader("primary_tigs_c.fa")
-with open("p-tigs_nodup.fa", "w") as f_out:
-    for r in f:
-        major, minor = r.name.split()[0].split("_")[:2]
-        if (major, "0000") in id_to_remove:
-            continue
-        if len(r.sequence) < 500:
-            continue
-        print >>f_out, ">"+r.name
-        print >>f_out, r.sequence
diff --git a/FALCON/test/helpers.py b/FALCON/test/helpers.py
index 898f7db..a6820b4 100644
--- a/FALCON/test/helpers.py
+++ b/FALCON/test/helpers.py
@@ -1,4 +1,5 @@
 from nose.tools import assert_equal, assert_raises, eq_
+import os.path
 
 def equal_list(a, b):
     eq_(set(a) ^ set(b), set())
@@ -12,3 +13,6 @@ def equal_multiline(a, b):
     alines = a.splitlines()
     blines = b.splitlines()
     equal_list(alines, blines)
+
+def get_test_data_dir():
+    return os.path.join(os.path.dirname(__file__), '..', 'test_data')
diff --git a/FALCON/test/test_calc_cutoff.py b/FALCON/test/test_calc_cutoff.py
new file mode 100644
index 0000000..4ae0d8a
--- /dev/null
+++ b/FALCON/test/test_calc_cutoff.py
@@ -0,0 +1,43 @@
+import falcon_kit.mains.calc_cutoff as mod
+import helpers
+import os.path
+import pytest
+
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+
+# Note: genome_size==1 makes math easy.
+
+def test_calc_cutoff(capsys):
+    partial_capture_fn = os.path.join(helpers.get_test_data_dir(), 'calc_cutoff/partial_capture.txt')
+    assert os.path.exists(partial_capture_fn)
+    mod.main('prog --coverage 14 1 {}'.format(partial_capture_fn).split())
+    out, err = capsys.readouterr()
+    assert out == '2'
+    assert not err
+
+expected_err = """
+GenomeCoverageError: Not enough reads available for desired genome coverage (bases needed=23 > actual=22)
+User-provided genome_size: 1
+Desired coverage: 23.0
+"""
+
+def test_calc_cutoff_err():
+    partial_capture_fn = os.path.join(helpers.get_test_data_dir(), 'calc_cutoff/partial_capture.txt')
+    assert os.path.exists(partial_capture_fn)
+    with pytest.raises(Exception) as excinfo:
+        mod.main('prog --coverage 23 1 {}'.format(partial_capture_fn).split())
+    assert expected_err in str(excinfo.value)
+
+def test_calc_cutoff_errfile(monkeypatch, tmpdir):
+    fn = str(tmpdir.mkdir('tmp').join('errfile'))
+    monkeypatch.setenv('PBFALCON_ERRFILE', fn)
+    partial_capture_fn = os.path.join(helpers.get_test_data_dir(), 'calc_cutoff/partial_capture.txt')
+    assert os.path.exists(partial_capture_fn)
+    with pytest.raises(Exception) as excinfo:
+        mod.main('prog --coverage 23 1 {}'.format(partial_capture_fn).split())
+    assert expected_err in str(excinfo.value)
+    assert expected_err in open(fn).read()
diff --git a/FALCON/test/test_functional.py b/FALCON/test/test_functional.py
index bbcacaf..81c5ff3 100644
--- a/FALCON/test/test_functional.py
+++ b/FALCON/test/test_functional.py
@@ -1,5 +1,6 @@
 import helpers
 from nose.tools import assert_equal, assert_raises, eq_
+import pytest
 import falcon_kit.functional as f
 import StringIO
 import collections
@@ -18,6 +19,12 @@ def test_get_daligner_job_descriptions():
     helpers.equal_multiline(result[('.2', '.1', '.2')], "daligner -v -h1 -t16 -H1 -e0.7 -l1 -s1000 raw_reads.2 raw_reads.1 raw_reads.2\nLAcheck -v raw_reads *.las\nLAsort -v raw_reads.1.raw_reads.2.C0 raw_reads.1.raw_reads.2.N0 && LAmerge -v L1.1.2 raw_reads.1.raw_reads.2.C0.S raw_reads.1.raw_reads.2.N0.S && rm raw_reads.1.raw_reads.2.C0.S.las raw_reads.1.raw_reads.2.N0.S.las\nLAsort -v raw_reads.2.raw_reads.1.C0 raw_reads.2.raw_reads.1.N0 && LAmerge -v L1.2.1 raw_reads.2.raw_reads.1.C0. [...]
     eq_(len(result), 2)
 
+def test_get_daligner_job_descriptions_with_bad_arg():
+    with pytest.raises(AssertionError) as excinfo:
+        f.get_daligner_job_descriptions(
+                'fake_filename.txt', 'raw_reads')
+    assert "['f', 'a', 'k', 'e'" in str(excinfo.value)
+
 def test_get_daligner_job_descriptions_small():
     # when there is only 1 block, a special case
     example_HPCdaligner = open(example_HPCdaligner_small_fn)
@@ -153,6 +160,13 @@ def test_calc_cutoff():
     got = f.calc_cutoff(target, partial_capture)
     eq_(expected, got)
 
+def test_calc_cutoff_bad_coverage():
+    target = 23 # > 22 available
+    expected_message = 'Not enough reads available for desired genome coverage (bases needed=23 > actual=22)'
+    with assert_raises(f.GenomeCoverageError) as ctx:
+        f.calc_cutoff(target, partial_capture)
+    eq_(expected_message, ctx.exception.message)
+
 sample_DBdump_output = """+ R 2
 + M 0
 + H 400
diff --git a/FALCON/test/test_stats_preassembly.py b/FALCON/test/test_stats_preassembly.py
index 628e479..26a3b9b 100644
--- a/FALCON/test/test_stats_preassembly.py
+++ b/FALCON/test/test_stats_preassembly.py
@@ -2,11 +2,16 @@ import falcon_kit.stats_preassembly as M
 import helpers
 from cStringIO import StringIO
 
+def test_stats_from_sorted_readlengths():
+    stats = M.stats_from_sorted_readlengths([1,2,3,4])
+    expected = M.Stats(nreads=4, total=10, n50=3, p95=4, esize=3.0)
+    helpers.assert_equal(stats, expected)
+
 def test_stats_dict():
     #Stats = collections.namedtuple('FastaStats', ['nreads', 'total', 'n50', 'p95'])
-    stats_raw_reads = M.Stats(100, 1000, 50, 95)
-    stats_seed_reads = M.Stats(50, 500, 25, 40)
-    stats_corrected_reads = M.Stats(10, 100, 5, 9)
+    stats_raw_reads = M.Stats(100, 1000, 50, 95, 0.0)
+    stats_seed_reads = M.Stats(50, 500, 25, 40, 0.0)
+    stats_corrected_reads = M.Stats(10, 100, 5, 9, 0.0)
     genome_length = 19
     length_cutoff = 10
     frag = 1.0
@@ -25,16 +30,20 @@ def test_stats_dict():
  'preassembled_seed_fragmentation': 1.0,
  'preassembled_seed_truncation': 2.5,
  'preassembled_yield': 0.2,
+ 'preassembled_esize': 0.0,
  'raw_bases': 1000,
  'raw_coverage': 52.632,
  'raw_mean': 10.0,
  'raw_n50': 50,
  'raw_p95': 95,
  'raw_reads': 100,
+ 'raw_esize': 0.0,
  'seed_bases': 500,
  'seed_coverage': 26.316,
  'seed_mean': 10.0,
  'seed_n50': 25,
  'seed_p95': 40,
- 'seed_reads': 50}
+ 'seed_reads': 50,
+ 'seed_esize': 0.0,
+ }
     helpers.equal_dict(result, expected)
diff --git a/FALCON/test_data/calc_cutoff/partial_capture.txt b/FALCON/test_data/calc_cutoff/partial_capture.txt
new file mode 100644
index 0000000..4b49d9c
--- /dev/null
+++ b/FALCON/test_data/calc_cutoff/partial_capture.txt
@@ -0,0 +1,5 @@
+        Bin:      Count  % Reads  % Bases     Average
+          4:          2      0.0      0.0      xxx
+          3:          0      0.0      0.0      xxx
+          2:          3      0.0      0.0      xxx
+          1:          8      0.0      0.0      xxx
diff --git a/FALCON/travis.sh b/FALCON/travis.sh
index 27dcfb9..fd74da7 100755
--- a/FALCON/travis.sh
+++ b/FALCON/travis.sh
@@ -5,16 +5,9 @@
 set -vex
 
 #env | sort
-mkdir -p fc-env
-rm -f fc-env/bin/python
-virtualenv -p python2.7 fc-env || ../virtualenv/virtualenv.py fc-env
-. fc-env/bin/activate
-python setup.py -v install
-python -c 'import falcon_kit; print falcon_kit.falcon'
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+export PATH=$PYTHONUSERBASE/bin:$PATH
 
-# When doctests are passing, add this:
-pip install nose
-nosetests -v test/
-nosetests -v --with-doctest falcon_kit/functional.py
-# We cannot run that on *all* modules because some include dependencies.
-# Just pypeFLOW for now, but I would rather not test dependencies.
+make install
+make test
diff --git a/bamboo_build_and_test.sh b/bamboo_build_and_test.sh
new file mode 100755
index 0000000..5447bce
--- /dev/null
+++ b/bamboo_build_and_test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#rm -rf FALCON-integrate
+
+#git clone https://github.com/PacificBiosciences/FALCON-integrate
+#cd FALCON-integrate
+pwd
+ls -l
+git submodule
+git --version
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+module unload git gcc ccache
+module load git/2.8.3
+module load gcc/4.9.2
+module load ccache/3.2.3
+#module load make
+
+set -vex
+git --version
+which gcc
+which g++
+gcc --version
+# We cannot use /bin/python without /bin/gcc.
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+export CCACHE_DIR=/mnt/secondary/Share/tmp/bamboo.mobs.ccachedir
+
+git remote -v
+ls -larth
+pwd
+#git submodule update --init #No! We must use BB for some,
+# and rel URLs do not work for file://nothing
+env | sort
+MY_BRANCH=${bamboo_planRepository_branch}
+#git submodule foreach git pull origin ${MY_BRANCH}
+git submodule foreach git checkout ${MY_BRANCH}
+#git submodule update --init git-sym FALCON-make FALCON-examples
+git submodule
+./travis.sh
+ls -l $HOME/.ccache
+cat $HOME/.ccache/ccache.conf
+date --utc
diff --git a/makefile b/makefile
index d80b0a2..df20994 100644
--- a/makefile
+++ b/makefile
@@ -6,7 +6,6 @@ default:
 	@echo 'make config-???'
 	@echo 'make all'
 init:
-	git submodule update --init
 	cp -f default-env.sh env.sh
 config-edit:
 	bash ./FALCON-make/config-edit.sh
@@ -22,4 +21,9 @@ install:
 test:
 	${MAKE} -C ./FALCON-make/ $@
 
+update: # for creating new releases
+	git submodule update --remote
+	git add .
+	log-compares # my own tool
+
 .PHONY: init test
diff --git a/pypeFLOW/bamboo_build.sh b/pypeFLOW/bamboo_build.sh
new file mode 100644
index 0000000..fdea7dd
--- /dev/null
+++ b/pypeFLOW/bamboo_build.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+#type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+#module unload git gcc ccache
+#module load git/2.8.3
+#module load gcc/4.9.2
+#module load ccache/3.2.3
+##module load make
+
+set -vx
+#git --version
+#which gcc
+#which g++
+#gcc --version
+## We cannot use /bin/python without /bin/gcc.
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+
+#pip -v install --upgrade --user pip
+pip -v install --user .
+
+make pylint
+
+#python setup.py bdist_wheel
+
+nosetests -v --with-xunit --xunit-file=nose.doctest.xml --with-doctest pypeflow/ pwatcher/fs_based.py
diff --git a/pypeFLOW/makefile b/pypeFLOW/makefile
new file mode 100644
index 0000000..583be07
--- /dev/null
+++ b/pypeFLOW/makefile
@@ -0,0 +1,3 @@
+default:
+pylint:
+	pylint --errors-only pypeflow/ pwatcher/
diff --git a/pypeFLOW/pwatcher/blocking.py b/pypeFLOW/pwatcher/blocking.py
index 57a53c8..ae3e380 100755
--- a/pypeFLOW/pwatcher/blocking.py
+++ b/pypeFLOW/pwatcher/blocking.py
@@ -97,8 +97,8 @@ class State(object):
             jobid2status[jobid] = status
     def get_running_jobids(self):
         return list(self.jobids_submitted)
-    def serialize(state):
-        return pprint.pformat(state.top)
+    def serialize(self):
+        return pprint.pformat(self.top)
     @staticmethod
     def deserialize(directory, content):
         state = State(directory)
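
The serialize fixes here and in fs_based.py below are behavior-preserving: Python binds the instance to the first parameter of a method whatever it is called, so serialize(state) already worked. Renaming it to self mainly satisfies pylint's no-self-argument error, which matters now that the new pypeFLOW/makefile below runs pylint --errors-only. A tiny hedged illustration (Demo is made up):

    class Demo(object):
        def old_style(state):    # flagged by pylint (no-self-argument), but runs
            return repr(state)
        def new_style(self):
            return repr(self)

    d = Demo()
    assert d.old_style() == d.new_style()
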
diff --git a/pypeFLOW/pwatcher/fs_based.py b/pypeFLOW/pwatcher/fs_based.py
index 1a203a8..0277d9a 100755
--- a/pypeFLOW/pwatcher/fs_based.py
+++ b/pypeFLOW/pwatcher/fs_based.py
@@ -128,8 +128,8 @@ class State(object):
         return {jobid: bjob.mjob for jobid, bjob in self.top['jobs'].iteritems()}
     def add_deleted_jobid(self, jobid):
         self.top['jobids_deleted'].append(jobid)
-    def serialize(state):
-        return pprint.pformat(state.top)
+    def serialize(self):
+        return pprint.pformat(self.top)
     @staticmethod
     def deserialize(directory, content):
         state = State(directory)
@@ -235,16 +235,18 @@ def background(script, exe='/bin/bash'):
     #system(checkcall, checked=True)
     return pid
 
-def qstripped(option):
+def qstripped(option, flag='-q'):
     """Given a string of options, remove any -q foo.
 
     >>> qstripped('-xy -q foo -z bar')
     '-xy -z bar'
+    >>> qstripped('-xy -p foo -z bar', '-p')
+    '-xy -z bar'
     """
     # For now, do not strip -qfoo
     vals = option.strip().split()
-    while '-q' in vals:
-        i = vals.index('-q')
+    while flag in vals:
+        i = vals.index(flag)
         vals = vals[0:i] + vals[i+2:]
     return ' '.join(vals)
 
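
With the extra flag parameter, the same helper can strip SLURM's -p as well as -q, and the submit() methods rewritten below now always prepend the configured job_queue instead of only adding it when no queue flag was present. A hedged sketch of how the pieces compose (compose and the option strings are made up for illustration; qstripped is copied from the hunk above):

    def qstripped(option, flag='-q'):
        vals = option.strip().split()
        while flag in vals:
            i = vals.index(flag)
            vals = vals[0:i] + vals[i+2:]
        return ' '.join(vals)

    # Made-up helper mirroring the pattern used in the submit() methods below:
    # the configured job_queue wins, any existing queue flag is stripped first.
    def compose(sge_option, job_queue, flag='-q'):
        if job_queue:
            return '{} {} '.format(flag, job_queue) + qstripped(sge_option, flag)
        return sge_option

    assert compose('-pe smp 4 -q old', 'prod') == '-q prod -pe smp 4'
    assert compose('-c 8 -p debug', 'batch', flag='-p') == '-p batch -c 8'
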
@@ -285,10 +287,10 @@ class MetaJobSge(object):
         specific = self.specific
         #cwd = os.getcwd()
         job_name = self.get_jobname()
-        sge_option = qstripped(self.mjob.job.options['sge_option'])
-        if '-q' not in sge_option:
-            job_queue = self.mjob.job.options['job_queue']
-            sge_option = '-q {} '.format(job_queue) + sge_option
+        sge_option = self.mjob.job.options['sge_option']
+        job_queue = self.mjob.job.options['job_queue']
+        if job_queue:
+            sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
         # Add shebang, in case shell_start_mode=unix_behavior.
         #   https://github.com/PacificBiosciences/FALCON/pull/348
         with open(script_fn, 'r') as original: data = original.read()
@@ -331,10 +333,10 @@ usage: qsub [-a date_time] [-A account_string] [-c interval]
         specific = self.specific
         #cwd = os.getcwd()
         job_name = self.get_jobname()
-        sge_option = qstripped(self.mjob.job.options['sge_option'])
-        if '-q' not in sge_option:
-            job_queue = self.mjob.job.options['job_queue']
-            sge_option = '-q {} '.format(job_queue) + sge_option
+        sge_option = self.mjob.job.options['sge_option']
+        job_queue = self.mjob.job.options['job_queue']
+        if job_queue:
+            sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
         # Add shebang, in case shell_start_mode=unix_behavior.
         #   https://github.com/PacificBiosciences/FALCON/pull/348
         with open(script_fn, 'r') as original: data = original.read()
@@ -370,10 +372,10 @@ class MetaJobTorque(object):
         specific = self.specific
         #cwd = os.getcwd()
         job_name = self.get_jobname()
-        sge_option = qstripped(self.mjob.job.options['sge_option'])
-        if '-q' not in sge_option:
-            job_queue = self.mjob.job.options['job_queue']
-            sge_option = '-q {} '.format(job_queue) + sge_option
+        sge_option = self.mjob.job.options['sge_option']
+        job_queue = self.mjob.job.options['job_queue']
+        if job_queue:
+            sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
         cwd = os.getcwd()
         # Add shebang, in case shell_start_mode=unix_behavior.
         #   https://github.com/PacificBiosciences/FALCON/pull/348
@@ -407,10 +409,10 @@ class MetaJobSlurm(object):
         """Can raise.
         """
         job_name = self.get_jobname()
-        sge_option = qstripped(self.mjob.job.options['sge_option'])
-        if '-p' not in sge_option:
-            job_queue = self.mjob.job.options['job_queue']
-            sge_option = '-p {} '.format(job_queue) + sge_option
+        sge_option = self.mjob.job.options['sge_option']
+        job_queue = self.mjob.job.options['job_queue']
+        if job_queue:
+            sge_option = '-p {} '.format(job_queue) + qstripped(sge_option, '-p')
         cwd = os.getcwd()
         sge_cmd = 'sbatch -J {job_name} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
                 **locals())
@@ -441,10 +443,10 @@ class MetaJobLsf(object):
         """Can raise.
         """
         job_name = self.get_jobname()
-        sge_option = qstripped(self.mjob.job.options['sge_option'])
-        if '-q' not in sge_option:
-            job_queue = self.mjob.job.options['job_queue']
-            sge_option = '-q {} '.format(job_queue) + sge_option
+        sge_option = self.mjob.job.options['sge_option']
+        job_queue = self.mjob.job.options['job_queue']
+        if job_queue:
+            sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
         sge_cmd = 'bsub -J {job_name} {sge_option} -o stdout -e stderr "{exe} {script_fn}"'.format(
                 **locals())
         # "Sets the user's execution environment for the job, including the current working directory, file creation mask, and all environment variables, and sets LSF environment variables before starting the job."
@@ -657,7 +659,7 @@ def delete_heartbeat(state, heartbeat, keep=False):
     try:
         bjob = state.get_bjob(jobid)
     except Exception:
-        log.exception('In delete_heartbeat(), unable to find batchjob for % (from %s)' %(jobid, heartbeat))
+        log.exception('In delete_heartbeat(), unable to find batchjob for %s (from %s)' %(jobid, heartbeat))
         log.warning('Cannot delete. You might be able to delete this yourself if you examine the content of %s.' %heartbeat_fn)
         # TODO: Maybe provide a default grid type, so we can attempt to delete anyway?
         return
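
The one-character change from % to %s above is more than cosmetic: the old format string raised ValueError at the very moment the code tried to report a missing batch job, hiding the message it was meant to log. A quick hedged reproduction (the exact exception text may vary by Python version):

    try:
        'unable to find batchjob for % (from %s)' % ('job-1', 'heartbeat-1')
    except ValueError as exc:
        print(exc)   # e.g. unsupported format character ' ' ...
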
diff --git a/pypeFLOW/pwatcher/mains/pypeflow_example.py b/pypeFLOW/pwatcher/mains/pypeflow_example.py
index b71cc03..aee0d2b 100644
--- a/pypeFLOW/pwatcher/mains/pypeflow_example.py
+++ b/pypeFLOW/pwatcher/mains/pypeflow_example.py
@@ -1,6 +1,5 @@
-from pypeflow.pwatcher_bridge import PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase
-from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
-from pypeflow.task import PypeTask
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+        makePypeLocalFile, fn, PypeTask)
 import json
 import logging.config
 import os
@@ -102,10 +101,9 @@ def main():
         JOB_TYPE, SLEEP_S))
     exitOnFailure=False
     concurrent_jobs=2
-    #Workflow = pypeflow.controller.PypeThreadWorkflow
     Workflow = PypeProcWatcherWorkflow
-    Workflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
     wf = Workflow(job_type=JOB_TYPE)
+    wf.max_jobs = concurrent_jobs
 
     par = dict(sleep_s=SLEEP_S)
     DIR ='mytmp'
@@ -113,17 +111,17 @@ def main():
     f0 = makePypeLocalFile('mytmp/f0')
     f1 = makePypeLocalFile('mytmp/f1')
     make_task = PypeTask(
-            #inputs = {'f': f},
+            inputs = {},
             outputs = {'f0': f0},
             parameters = par,
-            TaskType = MyFakePypeThreadTaskBase)
+    )
     task = make_task(taskrun0)
     wf.addTasks([task])
     make_task = PypeTask(
             inputs = {'f0': f0},
             outputs = {'f1': f1},
             parameters = par,
-            TaskType = MyFakePypeThreadTaskBase)
+    )
     task = make_task(taskrun1)
     wf.addTasks([task])
     wf.refreshTargets([task])
diff --git a/pypeFLOW/pwatcher/mains/query_server.py b/pypeFLOW/pwatcher/mains/query_server.py
index 94604d5..d642730 100755
--- a/pypeFLOW/pwatcher/mains/query_server.py
+++ b/pypeFLOW/pwatcher/mains/query_server.py
@@ -65,8 +65,7 @@ def find_server(args):
     if args.sf:
         i += 1
     if i > 1:
-        print('Error: may only specify server once')
-        parser.print_usage()
+        raise Exception('Error: may only specify server once. Try "--help".')
         return
     if args.sf:
         if os.path.exists(args.sf):
diff --git a/pypeFLOW/pwatcher/network_based.py b/pypeFLOW/pwatcher/network_based.py
index 803431e..e16b185 100755
--- a/pypeFLOW/pwatcher/network_based.py
+++ b/pypeFLOW/pwatcher/network_based.py
@@ -241,7 +241,7 @@ def start_server(server_directories, hostname='', port=0):
     # set daemon to make sure server shuts down when main program finishes
     hb_thread.daemon = True
     hb_thread.start()
-    log.debug('server ({}, {}) alive?'.format(hostname, port, hb_thread.is_alive()))
+    log.debug('server ({}, {}) alive? {}'.format(hostname, port, hb_thread.is_alive()))
     return (hb_thread.authkey, (hostname, port))
 
 class MetaJobClass(object):
@@ -324,7 +324,7 @@ class State(object):
                     self.top['auth'], self.top['server'] = start_server(self.get_server_directories(), old_hostname, old_port)
                 except StandardError:
                     self.top['auth'], self.top['server'] = start_server(self.get_server_directories())
-                self__.changed = True
+                self.__changed = True
     # if we restarted, orphaned jobs might have left exit files
     # update the server with exit info
     def cleanup_exits(self):
@@ -335,9 +335,9 @@ class State(object):
                     rc = f.readline().strip()
                 hsocket = socket.socket()
                 hsocket.connect(self.get_heartbeat_server())
-                socket_send(hsocket, 'e {} {}'.format(jobid, rc))
+                #socket_send(hsocket, 'e {} {}'.format(jobid, rc)) #TODO: Must get jobid from somewhere
                 hsocket.close()
-                os.remove(fn)
+                os.remove(exit_fn)
         else:
             makedirs(self.get_directory_exits())
     def restore_from_save(self, state_fn):
@@ -488,11 +488,11 @@ class MetaJobLocal(object):
         hsocket = socket.socket()
         try:
             hsocket.connect(state.get_heartbeat_server())
-            socket_send(hsocket, 'P {}'.format(self.mj.job.jobid))
+            socket_send(hsocket, 'P {}'.format(self.mjob.job.jobid))
             line = socket_read(hsocket)
             hsocket.close()
         except IOError as e:
-            log.exception('Failed to get pig/pgid for {}: {!r}'.format(self.mj.job.jobid, e))
+            log.exception('Failed to get pig/pgid for {}: {!r}'.format(self.mjob.job.jobid, e))
             return
         args = line.split(None, 2)
         pid = int(args[0])
@@ -502,7 +502,7 @@ class MetaJobLocal(object):
         try:
             os.kill(-pgid, sig)
         except Exception:
-            log.exception('Failed to kill(%s) pgid=-%s for %r. Trying pid=%s' %(sig, pgid, self.mj.job.jobid, pid))
+            log.exception('Failed to kill(%s) pgid=-%s for %r. Trying pid=%s' %(sig, pgid, self.mjob.job.jobid, pid))
             os.kill(pid, sig)
     def __repr__(self):
         return 'MetaJobLocal(%s)' %repr(self.mjob)
@@ -617,6 +617,7 @@ class MetaJobTorque(object):
     def __init__(self, mjob):
         super(MetaJobTorque, self).__init__(mjob)
         self.specific = '-V' # pass enV; '-j oe' => combine out/err
+        self.mjob = mjob
 class MetaJobSlurm(object):
     def submit(self, state, exe, script_fn):
         """Can raise.
@@ -831,7 +832,7 @@ def delete_jobid(state, jobid, keep=False):
     try:
         bjob = state.get_bjob(jobid)
     except Exception:
-        log.exception('In delete_jobid(), unable to find batchjob for %' %(jobid))
+        log.exception('In delete_jobid(), unable to find batchjob for %s' %(jobid))
         # TODO: Maybe provide a default grid type, so we can attempt to delete anyway?
         return
     try:
diff --git a/pypeFLOW/pypeflow/do_task.py b/pypeFLOW/pypeflow/do_task.py
index 1366059..3898dfb 100644
--- a/pypeFLOW/pypeflow/do_task.py
+++ b/pypeFLOW/pypeflow/do_task.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python2.7
 from . import do_support, util
 import argparse
-import contextlib
 import importlib
 import inspect
 import json
@@ -58,29 +57,6 @@ def get_parser():
         help='JSON file, as per epilog.')
     return parser
 
-@contextlib.contextmanager
-def cd(newdir):
-    prevdir = os.getcwd()
-    LOG.debug('CD: %r <- %r' %(newdir, prevdir))
-    os.chdir(os.path.expanduser(newdir))
-    try:
-        yield
-    finally:
-        LOG.debug('CD: %r -> %r' %(newdir, prevdir))
-        os.chdir(prevdir)
-
-def mkdirs(path):
-    if not os.path.isdir(path):
-        cmd = 'mkdir -p {}'.format(path)
-        util.system(cmd)
-def rmdirs(path):
-    if os.path.isdir(path):
-        if len(path) < 20 and 'home' in path:
-            LOG.error('Refusing to rm {!r} since it might be your homedir.'.format(path))
-            return
-        cmd = 'rm -rf {}'.format(path)
-        util.system(cmd)
-
 def wait_for(fn):
     global TIMEOUT
     LOG.debug('Checking existence of {!r} with timeout={}'.format(fn, TIMEOUT))
@@ -132,7 +108,7 @@ def run(json_fn, timeout, tmpdir):
     cfg = json.loads(open(json_fn).read())
     LOG.debug(pprint.pformat(cfg))
     rundir = os.path.dirname(json_fn)
-    with cd(rundir):
+    with util.cd(rundir):
         run_cfg_in_tmpdir(cfg, tmpdir)
 def run_cfg_in_tmpdir(cfg, tmpdir):
     for fn in cfg['inputs'].values():
@@ -150,8 +126,8 @@ def run_cfg_in_tmpdir(cfg, tmpdir):
         user = getpass.getuser()
         pid = os.getpid()
         myrundir = '{tmpdir}/{user}/pypetmp/{finaloutdir}'.format(**locals())
-        rmdirs(myrundir)
-        mkdirs(myrundir)
+        util.rmdirs(myrundir)
+        util.mkdirs(myrundir)
         # TODO(CD): Copy inputs w/ flock.
     else:
         myrundir = finaloutdir
diff --git a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
index 5dc4c60..40d3c7e 100644
--- a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
+++ b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
@@ -534,7 +534,7 @@ class _PypeTask(object):
         for k,v in self.outputs.iteritems():
             assert os.path.isabs(v.path), 'For {!r}, output {!r} is not absolute'.format(self.wdir, v)
         common = set(self.inputs.keys()) & set(self.outputs.keys())
-        assert (not common), 'Keys in both inputs and outputs of PypeTask({}): {!r}'.format(wdir, common)
+        assert (not common), 'Keys in both inputs and outputs of PypeTask({}): {!r}'.format(self.wdir, common)
     def __call__(self, func):
         self.func = func
         self.func_name = '{}.{}'.format(func.__module__, func.__name__)
diff --git a/pypeFLOW/pypeflow/util.py b/pypeFLOW/pypeflow/util.py
index 1b3e13f..5d26de5 100644
--- a/pypeFLOW/pypeflow/util.py
+++ b/pypeFLOW/pypeflow/util.py
@@ -21,6 +21,13 @@ def run(script_fn):
 def mkdirs(path):
     if not os.path.isdir(path):
         os.makedirs(path)
+def rmdirs(path):
+    if os.path.isdir(path):
+        if len(path) < 20 and 'home' in path:
+            LOG.error('Refusing to rm {!r} since it might be your homedir.'.format(path))
+            return
+        cmd = 'rm -rf {}'.format(path)
+        system(cmd)
 def system(cmd):
     LOG.info(cmd)
     rc = os.system(cmd)
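
rmdirs, moved here from do_task.py, keeps the same guard: paths shorter than 20 characters that mention 'home' are refused before any rm -rf is issued. A hedged illustration of just that condition with made-up paths (not how pypeflow is invoked):

    for path in ['/home/alice', '/scratch/users/alice/pypetmp/run-0']:
        if len(path) < 20 and 'home' in path:
            print('refused, looks like a home directory: %r' % path)
        else:
            print('would run: rm -rf %r' % path)
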
diff --git a/travis.sh b/travis.sh
index 439a78e..4e308a4 100755
--- a/travis.sh
+++ b/travis.sh
@@ -5,10 +5,17 @@
 set -vex
 
 #env | sort
-#sudo pip install virtualenv
 time date  # sanity check, since we use 'time' and 'date' in our scripts
-make init # Travis pulls submodules for us, but not --recursive
+
+#git submodule update --init
+# In Bamboo, we do not want this script to alter submodules,
+# since we do that inside Bamboo.
+
+# Note: Travis pulls submodules for us, but not --recursive. But we no longer need that.
+
+make init
 source env.sh
+#sudo pip install virtualenv # No! Prefer PYTHONUSERBASE.
 make config-edit-user
 make -j all
 make test
