[med-svn] [falcon] 04/15: New upstream version 1.8.8
Afif Elghraoui
afif at moszumanska.debian.org
Sat Dec 16 08:03:28 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository falcon.
commit 6fa9904c346391e416d9372ffd499e22ac9fa5f5
Author: Afif Elghraoui <afif at debian.org>
Date: Sat Dec 16 01:24:04 2017 -0500
New upstream version 1.8.8
---
DALIGNER/DBX.c | 85 ++
DALIGNER/DBX.h | 25 +
DALIGNER/GNUmakefile | 1 +
DALIGNER/LA4Falcon.c | 58 +-
DALIGNER/bamboo_build.sh | 28 +
DAZZ_DB/bamboo_build.sh | 23 +
FALCON-examples/git-sym.makefile | 6 +-
FALCON-examples/makefile | 2 +-
FALCON-examples/run/ecoli/fc_run.cfg | 8 +-
FALCON-examples/run/greg200k-sv2/fc_run.cfg | 13 +
FALCON-examples/run/greg200k-sv2/fc_unzip.cfg | 14 +-
FALCON-examples/run/greg200k-sv2/input.fofn | 4 +-
FALCON-examples/run/greg200k-sv2/input_bam.fofn | 2 +
.../run/synth0/{fc_run.cfg => fc_preads.cfg} | 26 +-
FALCON-examples/run/synth0/fc_run.cfg | 19 +-
FALCON-examples/run/synth0/makefile | 4 +-
FALCON-examples/run/synth0/preads.fofn | 2 +
FALCON/.travis.yml | 4 +-
FALCON/bamboo_build.sh | 28 +
FALCON/bamboo_test.sh | 14 +
FALCON/falcon_kit/FastaReader.py | 165 +--
FALCON/falcon_kit/__init__.py | 38 -
FALCON/falcon_kit/bash.py | 19 +-
FALCON/falcon_kit/falcon_kit.py | 47 +-
FALCON/falcon_kit/fc_asm_graph.py | 44 +-
FALCON/falcon_kit/functional.py | 22 +-
FALCON/falcon_kit/mains/actg_coordinate.py | 14 +-
FALCON/falcon_kit/mains/calc_cutoff.py | 20 +-
FALCON/falcon_kit/mains/dedup_a_tigs.py | 6 +-
FALCON/falcon_kit/mains/fetch_reads.py | 36 +-
FALCON/falcon_kit/mains/get_read_ctg_map.py | 3 +-
FALCON/falcon_kit/mains/graph_to_contig.py | 6 +-
FALCON/falcon_kit/mains/ovlp_filter.py | 8 +-
FALCON/falcon_kit/mains/ovlp_to_graph.py | 3 +-
FALCON/falcon_kit/mains/run1.py | 55 +-
FALCON/falcon_kit/pype_tasks.py | 4 +-
FALCON/falcon_kit/run_support.py | 49 +-
FALCON/falcon_kit/stats_preassembly.py | 17 +-
FALCON/falcon_kit/util/system.py | 9 +
FALCON/makefile | 65 ++
FALCON/mycoverage.cfg | 5 +
FALCON/mysitecustomize.py | 3 +
FALCON/setup.py | 1 +
FALCON/src/c/falcon.c | 13 +-
FALCON/src/py_scripts_v0.1/falcon_asm.py | 1154 ------------------
FALCON/src/py_scripts_v0.1/falcon_asm_s.py | 1220 --------------------
FALCON/src/py_scripts_v0.1/falcon_dedup.py | 119 --
FALCON/src/py_scripts_v0.1/falcon_fixasm.py | 213 ----
FALCON/src/py_scripts_v0.1/falcon_overlap.py | 328 ------
FALCON/src/py_scripts_v0.1/falcon_overlap2.py | 337 ------
FALCON/src/py_scripts_v0.1/falcon_qrm.py | 370 ------
FALCON/src/py_scripts_v0.1/falcon_qrm_0.py | 378 ------
FALCON/src/py_scripts_v0.1/falcon_sense.py | 248 ----
FALCON/src/py_scripts_v0.1/falcon_ucns_data.py | 120 --
FALCON/src/py_scripts_v0.1/falcon_utgcns.py | 124 --
FALCON/src/py_scripts_v0.1/get_ovl.sh | 7 -
FALCON/src/py_scripts_v0.1/get_rdata.py | 207 ----
FALCON/src/py_scripts_v0.1/overlapper.py | 216 ----
FALCON/src/py_scripts_v0.1/ovlp_filter.sh | 6 -
FALCON/src/py_scripts_v0.1/redis_graph.py | 79 --
FALCON/src/py_scripts_v0.1/remove_dup_ctg.py | 75 --
FALCON/test/helpers.py | 4 +
FALCON/test/test_calc_cutoff.py | 43 +
FALCON/test/test_functional.py | 14 +
FALCON/test/test_stats_preassembly.py | 17 +-
FALCON/test_data/calc_cutoff/partial_capture.txt | 5 +
FALCON/travis.sh | 17 +-
bamboo_build_and_test.sh | 42 +
makefile | 6 +-
pypeFLOW/bamboo_build.sh | 29 +
pypeFLOW/makefile | 3 +
pypeFLOW/pwatcher/blocking.py | 4 +-
pypeFLOW/pwatcher/fs_based.py | 54 +-
pypeFLOW/pwatcher/mains/pypeflow_example.py | 14 +-
pypeFLOW/pwatcher/mains/query_server.py | 3 +-
pypeFLOW/pwatcher/network_based.py | 17 +-
pypeFLOW/pypeflow/do_task.py | 30 +-
pypeFLOW/pypeflow/simple_pwatcher_bridge.py | 2 +-
pypeFLOW/pypeflow/util.py | 7 +
travis.sh | 11 +-
80 files changed, 873 insertions(+), 5668 deletions(-)
diff --git a/DALIGNER/DBX.c b/DALIGNER/DBX.c
new file mode 100644
index 0000000..2a84fbd
--- /dev/null
+++ b/DALIGNER/DBX.c
@@ -0,0 +1,85 @@
+#include "DBX.h"
+#include "DB.h"
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <assert.h>
+
+// From Jason, with 1 change
+static char* Load_Read_Data(HITS_DB *db) {
+ FILE *bases = (FILE*) db->bases;
+ struct stat sbuf;
+ char *data;
+
+ bases = fopen(Catenate(db->path,"","",".bps"),"r");
+ if (bases == NULL) EXIT(1);
+ stat(Catenate(db->path,"","",".bps"), &sbuf);
+ data = (char *) malloc(sbuf.st_size);
+ if (data == NULL) return NULL; // was EXIT(1), but we can proceed
+ fread(data, sbuf.st_size, 1, bases);
+ fclose(bases);
+ return(data);
+}
+
+// Wrapper
+int Open_DBX(char *path, HITS_DBX *dbx, bool preload) {
+ dbx->data = NULL;
+ int rc = Open_DB(path, &dbx->db);
+ switch (rc) {
+ case -1:
+ return -1;
+ case 0:
+ break;
+ case 1:
+ assert(rc != 1);
+ abort();
+ default:
+ assert(rc < -1 || rc > 1);
+ abort();
+ }
+ if (preload) {
+ dbx->data = Load_Read_Data(&dbx->db);
+ }
+ return 0;
+}
+
+// From Jason
+static int Load_Read_From_RAM(HITS_DB *db, char *data, int i, char *read, int ascii) {
+ int64 off;
+ int len, clen;
+ HITS_READ *r = db->reads;
+
+ if (i >= db->nreads) { EXIT(1); }
+
+ off = r[i].boff;
+ len = r[i].rlen;
+ clen = COMPRESSED_LEN(len);
+ if (clen > 0) { memcpy(read, data + off, clen); } //fread(read,clen,1,bases)
+ Uncompress_Read(len, read);
+ if (ascii == 1)
+ { Lower_Read(read);
+ read[-1] = '\0';
+ }
+ else if (ascii == 2)
+ { Upper_Read(read);
+ read[-1] = '\0';
+ }
+ else
+ read[-1] = 4;
+ return (0);
+}
+
+// Wrapper
+int Load_ReadX(HITS_DBX *dbx, int i, char *read, int ascii) {
+ if (dbx->data) {
+ return Load_Read_From_RAM(&dbx->db, dbx->data, i, read, ascii);
+ } else {
+ return Load_Read(&dbx->db, i, read, ascii);
+ }
+}
+
+// Wrapper
+void Close_DBX(HITS_DBX *dbx) {
+ Close_DB(&dbx->db);
+ if (dbx->data) free(dbx->data);
+}
diff --git a/DALIGNER/DBX.h b/DALIGNER/DBX.h
new file mode 100644
index 0000000..8fd9ace
--- /dev/null
+++ b/DALIGNER/DBX.h
@@ -0,0 +1,25 @@
+#ifndef DALIGNER_DBX_H
+#define DALIGNER_DBX_H
+/* Wrappers to extend HITS_DB.
+ *
+ * Note that none of the extra fields are ever stored on-disk.
+ */
+#include "DB.h"
+#include <stdbool.h>
+
+typedef struct {
+ HITS_DB db;
+/*
+ * When "data" is non-null, it stores the entire DB
+ * in memory, so we can avoid random-access disk operations.
+ * But if null, then wrappers simply delegate.
+ */
+ char* data;
+} HITS_DBX;
+
+int Open_DBX(char *path, HITS_DBX *dbx, bool preload);
+int Load_ReadX(HITS_DBX *dbx, int i, char *read, int ascii);
+//void Trim_DB(HITS_DBX *dbx);
+void Close_DBX(HITS_DBX *dbx);
+
+#endif
diff --git a/DALIGNER/GNUmakefile b/DALIGNER/GNUmakefile
index d0ceb2f..3d81c36 100644
--- a/DALIGNER/GNUmakefile
+++ b/DALIGNER/GNUmakefile
@@ -15,6 +15,7 @@ vpath %.a ${THISDIR}/../DAZZ_DB
all: ${ALL}
daligner: filter.o
daligner_p: filter_p.o
+LA4Falcon: DBX.o
${ALL}: align.o
install:
diff --git a/DALIGNER/LA4Falcon.c b/DALIGNER/LA4Falcon.c
index 2ee07a6..ef642b7 100644
--- a/DALIGNER/LA4Falcon.c
+++ b/DALIGNER/LA4Falcon.c
@@ -1,3 +1,4 @@
+/* vim: set et ts=2 sts=2 sw=2 : */
/************************************************************************************\
* *
* Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. *
@@ -54,18 +55,13 @@
* Last Mod: July 2015
*
*******************************************************************************************/
+#include "DB.h"
+#include "DBX.h"
+#include "align.h"
+#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <stdint.h>
-#include <ctype.h>
-#include <unistd.h>
-#include <stdbool.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "DB.h"
-#include "align.h"
#define MAX_OVERLAPS 50000
@@ -144,12 +140,13 @@ static bool add_overlap(const Alignment *aln, const Overlap *ovl, const int coun
return added;
}
-static void print_hits(const int hit_count, HITS_DB *db2, char *bbuffer, char buffer[], int64 bsize, const int MAX_HIT_COUNT) {
+static void print_hits(const int hit_count, HITS_DBX *dbx2, char *bbuffer, char buffer[], int64 bsize, const int MAX_HIT_COUNT) {
int tmp_idx;
qsort(ovlgrps, (hit_count+1), sizeof(OverlapGroup), compare_ovlgrps);
for (tmp_idx = 0; tmp_idx < (hit_count+1) && tmp_idx < MAX_HIT_COUNT; tmp_idx++) {
OverlapGroup *grp = &ovlgrps[tmp_idx];
- Load_Read(db2, grp->end.bread, bbuffer, 0);
+ //Load_ReadX assuming db2 == db1 is true
+ Load_ReadX(dbx2, grp->end.bread, bbuffer, 0);
if (COMP(grp->end.flags)) Complement_Seq(bbuffer, grp->blen );
Upper_Read(bbuffer);
int64 const rlen = (int64)(grp->end.path.bepos) - (int64)(grp->beg.path.bbpos);
@@ -178,8 +175,10 @@ static int ORDER(const void *l, const void *r)
}
int main(int argc, char *argv[])
-{ HITS_DB _db1, *db1 = &_db1;
- HITS_DB _db2, *db2 = &_db2;
+{ HITS_DBX _dbx1, *dbx1 = &_dbx1;
+ HITS_DBX _dbx2, *dbx2 = &_dbx2;
+ HITS_DB *db1 = &dbx1->db;
+ HITS_DB *db2 = &dbx2->db;
Overlap _ovl, *ovl = &_ovl;
Alignment _aln, *aln = &_aln;
@@ -196,6 +195,7 @@ int main(int argc, char *argv[])
int FALCON, OVERLAP, M4OVL;
// XXX: MAX_HIT_COUNT should be renamed
int SEED_MIN, MAX_HIT_COUNT, SKIP;
+ int PRELOAD;
// Process options
@@ -225,7 +225,7 @@ int main(int argc, char *argv[])
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
- ARG_FLAGS("smfocargUFM")
+ ARG_FLAGS("smfocargUFMP")
break;
case 'i':
ARG_NON_NEGATIVE(INDENT,"Indent")
@@ -259,6 +259,7 @@ int main(int argc, char *argv[])
FALCON = flags['f'];
SKIP = flags['s'];
GROUP = flags['g'];
+ PRELOAD = flags['P']; // Preload DB reads, if possible.
if (argc <= 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
@@ -274,7 +275,7 @@ int main(int argc, char *argv[])
FILE *input;
ISTWO = 0;
- status = Open_DB(argv[1],db1);
+ status = Open_DBX(argv[1],dbx1,PRELOAD);
if (status < 0)
exit (1);
if (db1->part > 0)
@@ -288,7 +289,7 @@ int main(int argc, char *argv[])
if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL)
{ ISTWO = 1;
fclose(input);
- status = Open_DB(argv[2],db2);
+ status = Open_DBX(argv[2],dbx2,PRELOAD);
if (status < 0)
exit (1);
if (db2->part > 0)
@@ -298,12 +299,16 @@ int main(int argc, char *argv[])
Trim_DB(db2);
}
else
- db2 = db1;
+ { dbx2 = dbx1;
+ db2 = db1;
+ }
free(root);
free(pwd);
}
else
- db2 = db1;
+ { dbx2 = dbx1;
+ db2 = db1;
+ }
Trim_DB(db1);
}
@@ -697,16 +702,16 @@ int main(int argc, char *argv[])
if (FALCON)
{
if (p_aread == -1) {
- Load_Read(db1, ovl->aread, abuffer, 2);
+ Load_ReadX(dbx1, ovl->aread, abuffer, 2);
printf("%08d %s\n", ovl->aread, abuffer);
p_aread = ovl->aread;
skip_rest = 0;
}
if (p_aread != ovl -> aread ) {
- print_hits(hit_count, db2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
+ print_hits(hit_count, dbx2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
hit_count = -1;
- Load_Read(db1, ovl->aread, abuffer, 2);
+ Load_ReadX(dbx1, ovl->aread, abuffer, 2);
printf("%08d %s\n", ovl->aread, abuffer);
p_aread = ovl->aread;
skip_rest = 0;
@@ -725,8 +730,8 @@ int main(int argc, char *argv[])
tps = ((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace);
if (small)
Decompress_TraceTo16(ovl);
- Load_Read(db1, ovl->aread, abuffer, 0);
- Load_Read(db2, ovl->bread, bbuffer, 0);
+ Load_ReadX(dbx1, ovl->aread, abuffer, 0);
+ Load_ReadX(dbx2, ovl->bread, bbuffer, 0);
if (COMP(aln->flags))
Complement_Seq(bbuffer, aln->blen);
Compute_Trace_PTS(aln,work,tspace);
@@ -828,7 +833,7 @@ int main(int argc, char *argv[])
if (FALCON && hit_count != -1)
{
- print_hits(hit_count, db2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
+ print_hits(hit_count, dbx2, bbuffer, buffer, (int64)sizeof(buffer), MAX_HIT_COUNT);
printf("- -\n");
free(ovlgrps);
}
@@ -842,9 +847,8 @@ int main(int argc, char *argv[])
}
}
- Close_DB(db1);
+ Close_DBX(dbx1);
if (ISTWO)
- Close_DB(db2);
-
+ Close_DBX(dbx2);
exit (0);
}
diff --git a/DALIGNER/bamboo_build.sh b/DALIGNER/bamboo_build.sh
new file mode 100644
index 0000000..b9193e5
--- /dev/null
+++ b/DALIGNER/bamboo_build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -ex
+type module >& /dev/null || source /mnt/software/Modules/current/init/bash
+
+set -vex
+module load gcc/4.9.2
+module load git/2.8.3
+module load ccache
+NEXUS_BASEURL=http://ossnexus.pacificbiosciences.com/repository
+NEXUS_URL=$NEXUS_BASEURL/unsupported/gcc-4.9.2
+
+rm -rf prebuilt build
+mkdir -p prebuilt/DAZZ_DB build/bin
+curl -s -L $NEXUS_URL/DAZZ_DB-SNAPSHOT.tgz|tar zxf - -C prebuilt/DAZZ_DB
+mkdir -p DAZZ_DB
+cp prebuilt/DAZZ_DB/lib/*.a DAZZ_DB/
+cp prebuilt/DAZZ_DB/include/*.h DAZZ_DB/
+
+make -C DALIGNER clean
+make -C DALIGNER LIBDIRS=$PWD/prebuilt/DAZZ_DB/lib -j
+make -C DALIGNER PREFIX=$PWD/build install
+
+make -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile clean
+PATH=.:${PATH} make -C DALIGNER -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile
+make -f /dept/secondary/siv/testdata/hgap/synth5k/LA4Falcon/makefile clean
+
+cd build
+tar zcf DALIGNER-SNAPSHOT.tgz bin
+curl -v -n --upload-file DALIGNER-SNAPSHOT.tgz $NEXUS_URL/DALIGNER-SNAPSHOT.tgz
diff --git a/DAZZ_DB/bamboo_build.sh b/DAZZ_DB/bamboo_build.sh
new file mode 100644
index 0000000..5748fa4
--- /dev/null
+++ b/DAZZ_DB/bamboo_build.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -xe
+type module >& /dev/null || source /mnt/software/Modules/current/init/bash
+
+set -vex
+
+module load gcc/4.9.2
+module load git/2.8.3
+module load ccache
+
+rm -rf build
+mkdir -p build/lib build/bin build/include
+cd DAZZ_DB
+make clean
+make -j
+make PREFIX=$PWD/../build install
+cp *.h ../build/include
+cd -
+cd build
+tar zcf DAZZ_DB-SNAPSHOT.tgz bin lib include
+NEXUS_BASEURL=http://ossnexus.pacificbiosciences.com/repository
+NEXUS_URL=$NEXUS_BASEURL/unsupported/gcc-4.9.2
+curl -v -n --upload-file DAZZ_DB-SNAPSHOT.tgz $NEXUS_URL/DAZZ_DB-SNAPSHOT.tgz
+cd -
diff --git a/FALCON-examples/git-sym.makefile b/FALCON-examples/git-sym.makefile
index ed9b703..c7b7fb5 100644
--- a/FALCON-examples/git-sym.makefile
+++ b/FALCON-examples/git-sym.makefile
@@ -24,8 +24,8 @@ synth0.ref.fasta:
arab-creads.fasta:
cp -f /lustre/hpcprod/cdunn/data/arab_test/corrected.fasta $@
synth5k.2016-11-02:
- curl -L https://downloads.pacbcloud.com/public/data/git-sym/synth5k.2016-11-02.tgz | tar xvfz -
+ curl -kL https://downloads.pacbcloud.com/public/data/git-sym/synth5k.2016-11-02.tgz | tar xvfz -
ecoli.m140913_050931_42139_c100713652400000001823152404301535_s1_p0:
curl -L https://downloads.pacbcloud.com/public/data/git-sym/ecoli.m140913_050931_42139_c100713652400000001823152404301535_s1_p0.subreads.tar | tar xvf -
-greg200k-sv2:
- curl -L https://downloads.pacbcloud.com/public/data/git-sym/greg200k-sv2.tar | tar xvf -
+greg200k-sv2.2:
+ curl -L https://downloads.pacbcloud.com/public/data/git-sym/greg200k-sv2.2.tar | tar xvf -
diff --git a/FALCON-examples/makefile b/FALCON-examples/makefile
index d600c35..948eb23 100644
--- a/FALCON-examples/makefile
+++ b/FALCON-examples/makefile
@@ -1,7 +1,7 @@
default:
@echo 'Try "make run-foo" for any sub-dir of run/.'
run-%: setup-%
- cd run/$*; fc_run.py fc_run.cfg logging.ini
+ cd run/$*; rm -rf 0-rawreads/ 1-preads_ovl/ 2-asm-falcon/; ls -l; fc_run.py fc_run.cfg logging.ini
setup-%:
git-sym update run/$*
git-sym show run/$*
diff --git a/FALCON-examples/run/ecoli/fc_run.cfg b/FALCON-examples/run/ecoli/fc_run.cfg
index c2d5682..12a24ce 100644
--- a/FALCON-examples/run/ecoli/fc_run.cfg
+++ b/FALCON-examples/run/ecoli/fc_run.cfg
@@ -1,5 +1,9 @@
[General]
-#job_type = local
+use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD}
+job_queue = bash -C ${CMD} > ${STDOUT_FILE} 2> ${STDERR_FILE}
# list of files of the initial bas.h5 files
input_fofn = input.fofn
@@ -25,6 +29,8 @@ sge_option_cns = -pe smp 8 -q %(jobqueue)s
pa_concurrent_jobs = 32
ovlp_concurrent_jobs = 32
+pa_concurrent_jobs = 6
+ovlp_concurrent_jobs = 6
pa_HPCdaligner_option = -v -B4 -t16 -e.70 -l1000 -s1000
ovlp_HPCdaligner_option = -v -B4 -t32 -h60 -e.96 -l500 -s1000
diff --git a/FALCON-examples/run/greg200k-sv2/fc_run.cfg b/FALCON-examples/run/greg200k-sv2/fc_run.cfg
index a8fb8c4..91c1e1c 100755
--- a/FALCON-examples/run/greg200k-sv2/fc_run.cfg
+++ b/FALCON-examples/run/greg200k-sv2/fc_run.cfg
@@ -5,9 +5,22 @@ input_fofn = input.fofn
job_type = local
+use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD}
+
input_type = raw
#input_type = preads
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
#openending = True
# The length cutoff used for seed reads used for initial mapping
diff --git a/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg b/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg
index c9de3af..2b6f9cd 100644
--- a/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg
+++ b/FALCON-examples/run/greg200k-sv2/fc_unzip.cfg
@@ -2,13 +2,25 @@
job_type = SGE
job_type = local
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+#job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
+max_n_open_files = 1000
+
[Unzip]
input_fofn= input.fofn
input_bam_fofn= input_bam.fofn
#smrt_bin= /mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/
#smrt_bin=/mnt/secondary/builds/full/3.0.1/prod/current-build_smrtanalysis/smrtcmds/bin/
-smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
+#smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
+#smrt_bin=/mnt/secondary/builds/full/3.2.0/prod/current-build_smrttools-incremental/smrtcmds/bin/ # does not work
+smrt_bin=/pbi/dept/secondary/builds/4.1.0/current_smrttools_incremental_installdir/smrtcmds/bin
sge_phasing= -pe smp 12 -q bigmem
sge_quiver= -pe smp 12 -q sequel-farm
sge_track_reads= -pe smp 12 -q default
diff --git a/FALCON-examples/run/greg200k-sv2/input.fofn b/FALCON-examples/run/greg200k-sv2/input.fofn
index 99cce2d..84a7c53 100644
--- a/FALCON-examples/run/greg200k-sv2/input.fofn
+++ b/FALCON-examples/run/greg200k-sv2/input.fofn
@@ -1,2 +1,2 @@
-data/greg200k-sv2/subreads1.dexta
-data/greg200k-sv2/subreads2.dexta
+data/greg200k-sv2/subreads1.fasta
+data/greg200k-sv2/subreads2.fasta
diff --git a/FALCON-examples/run/greg200k-sv2/input_bam.fofn b/FALCON-examples/run/greg200k-sv2/input_bam.fofn
new file mode 100644
index 0000000..af301f1
--- /dev/null
+++ b/FALCON-examples/run/greg200k-sv2/input_bam.fofn
@@ -0,0 +1,2 @@
+data/greg200k-sv2/subreads1.bam
+data/greg200k-sv2/subreads2.bam
diff --git a/FALCON-examples/run/synth0/fc_run.cfg b/FALCON-examples/run/synth0/fc_preads.cfg
similarity index 71%
copy from FALCON-examples/run/synth0/fc_run.cfg
copy to FALCON-examples/run/synth0/fc_preads.cfg
index 0740760..2d92250 100644
--- a/FALCON-examples/run/synth0/fc_run.cfg
+++ b/FALCON-examples/run/synth0/fc_preads.cfg
@@ -1,15 +1,24 @@
[General]
-use_tmpdir = true
-job_type = local
+#use_tmpdir = true
+#job_type = local
#job_type = sge
-#stop_all_jobs_on_failure = true
+stop_all_jobs_on_failure = true
+
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
# list of files of the initial bas.h5 files
-input_fofn = input.fofn
-#input_fofn = preads.fofn
+#input_fofn = input.fofn
+input_fofn = preads.fofn
-input_type = raw
-#input_type = preads
+#input_type = raw
+input_type = preads
# The length cutoff used for seed reads used for initial mapping
#length_cutoff = 1
@@ -20,7 +29,7 @@ seed_coverage = 20
length_cutoff_pr = 1
-job_queue = production
+#job_queue = production
sge_option_da = -pe smp 8 -q %(job_queue)s
sge_option_la = -pe smp 2 -q %(job_queue)s
sge_option_pda = -pe smp 8 -q %(job_queue)s
@@ -40,6 +49,7 @@ pa_DBsplit_option = -a -x5 -s.065536
#pa_DBsplit_option = -a -x5 -s1
ovlp_DBsplit_option = -a -x5 -s50
+LA4Falcon_preload = true
falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 1 --max_n_read 20000 --n_core 0
#--min_cov_aln 1 --min_len_aln 40
diff --git a/FALCON-examples/run/synth0/fc_run.cfg b/FALCON-examples/run/synth0/fc_run.cfg
index 0740760..6de3553 100644
--- a/FALCON-examples/run/synth0/fc_run.cfg
+++ b/FALCON-examples/run/synth0/fc_run.cfg
@@ -1,8 +1,18 @@
[General]
-use_tmpdir = true
-job_type = local
+#use_tmpdir = true
+#job_type = local
#job_type = sge
-#stop_all_jobs_on_failure = true
+stop_all_jobs_on_failure = true
+
+#skip_checks = true
+#use_tmpdir = /scratch
+pwatcher_type = blocking
+job_type = string
+job_queue = bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}
+job_queue = bash -C ${CMD}
+# By dropping STD*_FILE, we see all output on the console.
+# That helps debugging in TravisCI/Bamboo.
+
# list of files of the initial bas.h5 files
input_fofn = input.fofn
@@ -20,7 +30,7 @@ seed_coverage = 20
length_cutoff_pr = 1
-job_queue = production
+#job_queue = production
sge_option_da = -pe smp 8 -q %(job_queue)s
sge_option_la = -pe smp 2 -q %(job_queue)s
sge_option_pda = -pe smp 8 -q %(job_queue)s
@@ -40,6 +50,7 @@ pa_DBsplit_option = -a -x5 -s.065536
#pa_DBsplit_option = -a -x5 -s1
ovlp_DBsplit_option = -a -x5 -s50
+LA4Falcon_preload = true
falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 1 --max_n_read 20000 --n_core 0
#--min_cov_aln 1 --min_len_aln 40
diff --git a/FALCON-examples/run/synth0/makefile b/FALCON-examples/run/synth0/makefile
index e7ef88e..14ca0ea 100644
--- a/FALCON-examples/run/synth0/makefile
+++ b/FALCON-examples/run/synth0/makefile
@@ -1,10 +1,12 @@
# This will show 'shift by 273', but we do not mind if the shift changes,
# since it is circular. We just want output to match input with some shift,
# and maybe with reverse-complement.
+FC_CFG?=fc_run.cfg
+
go: run
${MAKE} test
run:
- fc_run fc_run.cfg logging.json
+ fc_run ${FC_CFG} logging.json
test:
./check.py
clean:
diff --git a/FALCON-examples/run/synth0/preads.fofn b/FALCON-examples/run/synth0/preads.fofn
new file mode 100644
index 0000000..63753e6
--- /dev/null
+++ b/FALCON-examples/run/synth0/preads.fofn
@@ -0,0 +1,2 @@
+data/preads/cns_00001.fasta
+data/preads/cns_00002.fasta
diff --git a/FALCON/.travis.yml b/FALCON/.travis.yml
index dd67f33..baa1e88 100644
--- a/FALCON/.travis.yml
+++ b/FALCON/.travis.yml
@@ -12,7 +12,9 @@
#sudo: required
os:
- linux
-language: python
+#language: python # This seems to cause virtualenv, which we do not want. We prefer a --user install.
+# But to speed-up start-up,
+language: c
compiler:
- clang # hmm. distutils uses 'gcc' anyway
# - gcc
diff --git a/FALCON/bamboo_build.sh b/FALCON/bamboo_build.sh
new file mode 100644
index 0000000..f003289
--- /dev/null
+++ b/FALCON/bamboo_build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+module unload git gcc ccache
+module load git/2.8.3
+module load gcc/4.9.2
+module load ccache/3.2.3
+#module load make
+
+set -vx
+git --version
+which gcc
+which g++
+gcc --version
+# We cannot use /bin/python without /bin/gcc.
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+
+make install-edit
+# Note: no --edit because we might be building artifacts.
+# ... Scratch that. We have trouble getting coverage for
+# source=falcon_kit
+# but maybe it will work with a --edit install.
+
+make pylint
diff --git a/FALCON/bamboo_test.sh b/FALCON/bamboo_test.sh
new file mode 100644
index 0000000..e8876be
--- /dev/null
+++ b/FALCON/bamboo_test.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+set -vex
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+export PATH=$PYTHONUSERBASE/bin:$PATH
+
+pip install --user pytest coverage
+#make test
+make coverage-install
+make coverage
+chmod -R ugo+rwx .
diff --git a/FALCON/falcon_kit/FastaReader.py b/FALCON/falcon_kit/FastaReader.py
index e1a7780..81c8d8c 100644
--- a/FALCON/falcon_kit/FastaReader.py
+++ b/FALCON/falcon_kit/FastaReader.py
@@ -1,44 +1,18 @@
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
from os.path import abspath, expanduser
from cStringIO import StringIO
+import contextlib
+import gzip
import md5
import re
+import subprocess
+
+##
+## Utility functions for FastaReader
+##
+def wrap(s, columns):
+ return "\n".join(s[start:start+columns]
+ for start in xrange(0, len(s), columns))
+
def splitFastaHeader( name ):
"""
@@ -70,51 +44,6 @@ def splitFileContents(f, delimiter, BLOCKSIZE=8192):
remainder.write(part)
yield remainder.getvalue()
-def isFileLikeObject(o):
- return hasattr(o, "read") and hasattr(o, "write")
-
-def getFileHandle(filenameOrFile, mode="r"):
- """
- Given a filename not ending in ".gz", open the file with the
- appropriate mode.
- Given a filename ending in ".gz", return a filehandle to the
- unzipped stream.
- Given a file object, return it unless the mode is incorrect--in
- that case, raise an exception.
- """
- assert mode in ("r", "w")
-
- if isinstance(filenameOrFile, basestring):
- filename = abspath(expanduser(filenameOrFile))
- if filename.endswith(".gz"):
- return gzip.open(filename, mode)
- else:
- return open(filename, mode)
- elif isFileLikeObject(filenameOrFile):
- return filenameOrFile
- else:
- raise Exception("Invalid type to getFileHandle")
-
-
-class ReaderBase(object):
- def __init__(self, f):
- """
- Prepare for iteration through the records in the file
- """
- self.file = getFileHandle(f, "r")
-
- def close(self):
- """
- Close the underlying file
- """
- self.file.close()
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
-
class FastaRecord(object):
"""
@@ -198,18 +127,6 @@ class FastaRecord(object):
except AssertionError:
raise ValueError("String not recognized as a valid FASTA record")
- def reverseComplement(self, preserveHeader=False):
- """
- Return a new FastaRecord with the reverse-complemented DNA sequence.
- Optionally, supply a name
- """
- rcSequence = sequences.reverseComplement(self.sequence)
- if preserveHeader:
- return FastaRecord(self.name, rcSequence)
- else:
- rcName = '{0} [revcomp]'.format(self.name.strip())
- return FastaRecord(rcName, rcSequence)
-
def __eq__(self, other):
if isinstance(other, self.__class__):
return (self.name == other.name and
@@ -229,8 +146,34 @@ class FastaRecord(object):
wrap(self.sequence, self.COLUMNS)
-class FastaReader(ReaderBase):
+# These are refactored from ReaderBase/FastaReader.
+
+def yield_fasta_records(f, fn):
+ """
+ f: fileobj
+ fn: str - filename (for exceptions)
"""
+ try:
+ parts = splitFileContents(f, ">")
+ assert "" == next(parts)
+ for part in parts:
+ yield FastaRecord.fromString(">" + part)
+ except AssertionError:
+ raise Exception("Invalid FASTA file {!r}".format(fn))
+
+
+def stream_stdout(call, fn):
+ args = call.split()
+ proc = subprocess.Popen(args, stdin=open(fn), stdout=subprocess.PIPE)
+ return proc.stdout
+
+@contextlib.contextmanager
+def open_fasta_reader(fn):
+ """
+ fn: str - filename
+
+ Note: If you already have a fileobj, you can iterate over yield_fasta_records() directly.
+
Streaming reader for FASTA files, useable as a one-shot iterator
over FastaRecord objects. Agnostic about line wrapping.
Example:
@@ -239,22 +182,32 @@ class FastaReader(ReaderBase):
> from pbcore import data
> filename = data.getTinyFasta()
> r = FastaReader(filename)
- > for record in r:
+ > with open_fasta_reader(filename) as r:
+ ... for record in r:
... print record.name, len(record.sequence), record.md5
ref000001|EGFR_Exon_2 183 e3912e9ceacd6538ede8c1b2adda7423
ref000002|EGFR_Exon_3 203 4bf218da37175a91869033024ac8f9e9
ref000003|EGFR_Exon_4 215 245bc7a046aad0788c22b071ed210f4d
ref000004|EGFR_Exon_5 157 c368b8191164a9d6ab76fd328e2803ca
- > r.close()
"""
- DELIMITER = ">"
+ filename = abspath(expanduser(fn))
+ mode = 'r'
+ if filename.endswith(".gz"):
+ ofs = gzip.open(filename, mode)
+ elif filename.endswith(".dexta"):
+ ofs = stream_stdout("undexta -vkU -w60 -i", filename)
+ else:
+ ofs = open(filename, mode)
+ yield yield_fasta_records(ofs, filename)
+ ofs.close()
- def __iter__(self):
- try:
- parts = splitFileContents(self.file, ">")
- assert "" == next(parts)
- for part in parts:
- yield FastaRecord.fromString(">" + part)
- except AssertionError:
- raise ValueError("Invalid FASTA file")
+class FastaReader(object):
+ """Deprecated, but should still work (with filenames).
+ """
+ def __iter__(self):
+ with open_fasta_reader(self.filename) as reader:
+ for rec in reader:
+ yield rec
+ def __init__(self, f):
+ self.filename = f
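
Aside: a minimal illustrative sketch (not from the diff above) of how the refactored reader might be used downstream; the filename here is only an example, and per this commit plain FASTA, .gz, and .dexta inputs are all accepted:

    from falcon_kit.FastaReader import open_fasta_reader

    with open_fasta_reader('preads4falcon.fasta') as reader:  # filename is illustrative
        for record in reader:
            # FastaRecord still exposes name, sequence, and md5, as before
            print record.name, len(record.sequence)
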
diff --git a/FALCON/falcon_kit/__init__.py b/FALCON/falcon_kit/__init__.py
index 2e1685f..df872a8 100644
--- a/FALCON/falcon_kit/__init__.py
+++ b/FALCON/falcon_kit/__init__.py
@@ -1,39 +1 @@
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
from .falcon_kit import *
diff --git a/FALCON/falcon_kit/bash.py b/FALCON/falcon_kit/bash.py
index e0880e8..df34cdb 100644
--- a/FALCON/falcon_kit/bash.py
+++ b/FALCON/falcon_kit/bash.py
@@ -194,6 +194,7 @@ def script_build_rdb(config, input_fofn_fn, run_jobs_bfn):
mdust = ''
params.update(locals())
script = """\
+echo "PBFALCON_ERRFILE=$PBFALCON_ERRFILE"
set -o pipefail
#fc_fasta2fasta < {input_fofn_fn} >| fc.fofn
while read fn; do {cat_fasta} $fn | fasta2DB -v raw_reads -i${{fn##*/}}; done < {input_fofn_fn}
@@ -307,8 +308,9 @@ rmfollow() {
if not line.startswith('LAmerge'):
continue
las_files = [word + '.las' for word in functional.yield_args_from_line(line)]
- assert las_fn == os.path.basename(las_files[0])
- script.extend('rmfollow {}'.format(fn) for fn in las_files[1:])
+ #las_fn = os.path.basename(las_files[0])
+ #assert las_fn == os.path.basename(las_files[0])
+ script.extend('# rmfollow {}'.format(fn) for fn in las_files[1:])
break
content = bash_funcs + '\n'.join(script + [''])
@@ -327,13 +329,16 @@ def script_run_consensus(config, db_fn, las_fn, out_file_bfn):
else:
bash_cutoff = '{}'.format(length_cutoff)
params.update(locals())
+ LA4Falcon_flags = 'P' if params.get('LA4Falcon_preload') else ''
if config["falcon_sense_skip_contained"]:
- run_consensus = """LA4Falcon -H$CUTOFF -fso {db_fn} {las_fn} | """
+ LA4Falcon_flags += 'fso'
elif config["falcon_sense_greedy"]:
- run_consensus = """LA4Falcon -H$CUTOFF -fog {db_fn} {las_fn} | """
+ LA4Falcon_flags += 'fog'
else:
- run_consensus = """LA4Falcon -H$CUTOFF -fo {db_fn} {las_fn} | """
- run_consensus += """fc_consensus {falcon_sense_option} >| {out_file_bfn}"""
+ LA4Falcon_flags += 'fo'
+ if LA4Falcon_flags:
+ LA4Falcon_flags = '-' + ''.join(set(LA4Falcon_flags))
+ run_consensus = "LA4Falcon -H$CUTOFF %s {db_fn} {las_fn} | fc_consensus {falcon_sense_option} >| {out_file_bfn}"%LA4Falcon_flags
if config.get('dazcon', False):
run_consensus = """
@@ -378,6 +383,6 @@ def script_run_report_pre_assembly(i_raw_reads_db_fn, i_preads_fofn_fn, genome_l
params = dict()
params.update(locals())
script = """\
-python -m falcon_kit.mains.report_pre_assembly --genome-length {genome_length} --length-cutoff {length_cutoff} --db {i_raw_reads_db_fn} --preads-fofn {i_preads_fofn_fn} --out {o_json_fn}
+python2.7 -m falcon_kit.mains.report_pre_assembly --genome-length {genome_length} --length-cutoff {length_cutoff} --db {i_raw_reads_db_fn} --preads-fofn {i_preads_fofn_fn} --out {o_json_fn}
"""
return script.format(**params)
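
Aside: a rough, hypothetical illustration (not from the diff above) of how the LA4Falcon flag string assembled in script_run_consensus ends up on the command line, assuming LA4Falcon_preload = true and falcon_sense_skip_contained; the db/las names below are placeholders for the real {db_fn} and {las_fn}:

    flags = 'P'                        # from LA4Falcon_preload
    flags += 'fso'                     # falcon_sense_skip_contained branch
    flags = '-' + ''.join(set(flags))
    # e.g. '-Pfso'; the character order is unspecified because set() is unordered
    cmd = 'LA4Falcon -H$CUTOFF %s raw_reads.db raw_reads.1.las | fc_consensus ...' % flags
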
diff --git a/FALCON/falcon_kit/falcon_kit.py b/FALCON/falcon_kit/falcon_kit.py
index 0d01659..e87e2d6 100644
--- a/FALCON/falcon_kit/falcon_kit.py
+++ b/FALCON/falcon_kit/falcon_kit.py
@@ -1,40 +1,4 @@
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
+from __future__ import absolute_import
__all__ = [
'kup', 'DWA', 'falcon',
'KmerLookup', 'KmerMatch', 'AlnRange', 'ConsensusData',
@@ -42,6 +6,7 @@ __all__ = [
]
from ctypes import *
+import os
import ext_falcon
#module_path = os.path.split(__file__)[0]
@@ -70,8 +35,12 @@ class ConsensusData(Structure):
_fields_ = [ ("sequence", c_char_p),
("eff_cov", POINTER(c_uint)) ]
-
-falcon_dll = CDLL(ext_falcon.__file__)
+try:
+ falcon_dll = CDLL(ext_falcon.__file__)
+except OSError:
+ # It seems that setup.py has changed the __file__ it attaches to an extension module.
+    # I have no idea why, but this works around it.
+ falcon_dll = CDLL(os.path.join(os.path.dirname(__file__), '..', os.path.basename(ext_falcon.__file__)))
kup = falcon_dll
diff --git a/FALCON/falcon_kit/fc_asm_graph.py b/FALCON/falcon_kit/fc_asm_graph.py
index 1f0e376..3cc7984 100644
--- a/FALCON/falcon_kit/fc_asm_graph.py
+++ b/FALCON/falcon_kit/fc_asm_graph.py
@@ -1,42 +1,6 @@
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
+from __future__ import absolute_import
+from .FastaReader import open_fasta_reader
import networkx as nx
-from FastaReader import FastaReader
RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
@@ -90,8 +54,8 @@ class AsmGraph(object):
seqs = {}
# load all p-read name into memory
- f = FastaReader(fasta_fn)
- for r in f:
+ with open_fasta_reader(fasta_fn) as f:
+ for r in f:
if r.name not in all_read_ids:
continue
seqs[r.name] = r.sequence.upper()
diff --git a/FALCON/falcon_kit/functional.py b/FALCON/falcon_kit/functional.py
index b9495d5..04ba8de 100644
--- a/FALCON/falcon_kit/functional.py
+++ b/FALCON/falcon_kit/functional.py
@@ -6,7 +6,7 @@ import re
import StringIO
def _verify_pairs(pairs1, pairs2):
- if pairs1 != pairs2:
+ if pairs1 != pairs2: # pragma: no cover
print('pair2dali:', pairs1)
print('pair2sort:', pairs2)
print('dali-sort:', set(pairs1) - set(pairs2))
@@ -34,8 +34,8 @@ def get_daligner_job_descriptions_sans_LAcheck(run_jobs_stream, db_prefix, singl
result = {}
for k,v in descs.iteritems():
bash = skip_LAcheck(v)
- bash = bash.replace('LAsort', 'python -m falcon_kit.mains.LAsort {}'.format(db_prefix))
- bash = bash.replace('LAmerge', 'python -m falcon_kit.mains.LAmerge {}'.format(db_prefix))
+ bash = bash.replace('LAsort', 'python2.7 -m falcon_kit.mains.LAsort {}'.format(db_prefix))
+ bash = bash.replace('LAmerge', 'python2.7 -m falcon_kit.mains.LAmerge {}'.format(db_prefix))
result[k] = bash
return result
@@ -79,7 +79,7 @@ def get_daligner_job_descriptions(run_jobs_stream, db_prefix, single=False):
Can return [('', '')] if only 1 block.
"""
mo = re_pair_sort.search(line)
- if not mo:
+ if not mo: # pragma: no cover
raise Exception('Pattern {!r} does not match line {!r}'.format(
re_pair_sort.pattern, line))
return mo.group(1, 2)
@@ -177,7 +177,7 @@ def get_las_filenames(mjob_data, db_prefix):
mo = regex.search(bash_lines[i])
if not mo:
raise Exception('Regex {!r} failed on {!r}'.format(
- re_las_name.pattern, bash_lines[i]))
+ regex.pattern, bash_lines[i]))
las_fn = mo.group(1) + '.las'
result[p_id] = las_fn
return result
@@ -234,20 +234,26 @@ def get_script_xformer(pread_aln):
else:
return xform_script_for_raw_reads
+class GenomeCoverageError(Exception):
+ pass
+
def calc_cutoff_from_reverse_sorted_readlength_counts(rl_counts, target):
"""Return first read_len which gives at least 'target' bases.
"""
total = sum(pair[0]*pair[1] for pair in rl_counts)
subtotal = 0
- assert target <= total, 'Not enough genome coverage (target={} < actual={})'.format(target, total)
+ if target > total:
+ msg = 'Not enough reads available for desired genome coverage (bases needed={} > actual={})'.format(target, total)
+ raise GenomeCoverageError(msg)
cutoff = 0
for (rl, count) in rl_counts:
subtotal += rl*count
if subtotal >= target:
cutoff = rl
break
- else:
- raise Exception('Impossible target: target={target}, subtotal={subtotal}, total={total}'.format(locals()))
+ else: # pragma: no cover
+ msg = 'Impossible target (probably a bug): target={target}, subtotal={subtotal}, total={total}'.format(locals())
+ raise Exception(msg)
return cutoff
def num2int(num):
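
Aside: a small worked example (illustrative, not from this commit) of the cutoff calculation above:

    from falcon_kit.functional import calc_cutoff_from_reverse_sorted_readlength_counts

    rl_counts = [(9000, 10), (7000, 20), (5000, 50)]  # (read_len, count), longest first
    target = 200000                                   # roughly genome_size * coverage
    # running bases: 90,000 then 230,000 >= 200,000, so the cutoff is 7000
    assert calc_cutoff_from_reverse_sorted_readlength_counts(rl_counts, target) == 7000
    # asking for more than the 480,000 bases present raises GenomeCoverageError instead
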
diff --git a/FALCON/falcon_kit/mains/actg_coordinate.py b/FALCON/falcon_kit/mains/actg_coordinate.py
index 52d34d1..b025d97 100644
--- a/FALCON/falcon_kit/mains/actg_coordinate.py
+++ b/FALCON/falcon_kit/mains/actg_coordinate.py
@@ -1,4 +1,4 @@
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
def main(argv=None):
@@ -19,9 +19,9 @@ def main(argv=None):
p_ctg_coor_map[ctg_id][w] = coor
- a_ctg_fasta = FastaReader("a_ctg.fa")
- for r in a_ctg_fasta:
- rid = r.name.split()
- rid, v, w = rid[:3]
- pid = rid.split("-")[0]
- print rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w]
+ with open_fasta_reader("a_ctg.fa") as a_ctg_fasta:
+ for r in a_ctg_fasta:
+ rid = r.name.split()
+ rid, v, w = rid[:3]
+ pid = rid.split("-")[0]
+ print rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w]
diff --git a/FALCON/falcon_kit/mains/calc_cutoff.py b/FALCON/falcon_kit/mains/calc_cutoff.py
index 4d8b974..f0492a0 100644
--- a/FALCON/falcon_kit/mains/calc_cutoff.py
+++ b/FALCON/falcon_kit/mains/calc_cutoff.py
@@ -1,6 +1,8 @@
from .. import functional as f
import argparse
+import os
import sys
+import traceback
def main(argv=sys.argv):
import argparse
@@ -15,6 +17,9 @@ This is useful when length_cutoff is not provided but the genome-size
can be estimated. The purpose is to *reduce* the amount of data seen by
DALIGNER, since otherwise it will miss many alignments when it
encounters resource limits.
+
+Note: If PBFALCON_ERRFILE is defined (and its directory is writable),
+we will write errors there in addition to stderr.
"""
parser = argparse.ArgumentParser(description=description, epilog=epilog,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -22,14 +27,25 @@ encounters resource limits.
help='Desired coverage ratio (i.e. over-sampling)')
parser.add_argument('genome_size', type=int,
help='Estimated number of bases in genome. (haploid?)')
- parser.add_argument('capture', default='-',
+ parser.add_argument('capture', #default='-', # I guess default is not allowed for required args.
help='File with captured output of DBstats. (Otherwise, stdin.)')
args = parser.parse_args(argv[1:])
target = int(args.genome_size * args.coverage)
capture = open(args.capture) if args.capture!='-' else sys.stdin
stats = capture.read()
- cutoff = f.calc_cutoff(target, stats)
+ try:
+ cutoff = f.calc_cutoff(target, stats)
+ except Exception:
+ msg = traceback.format_exc()
+ msg += 'User-provided genome_size: {}\nDesired coverage: {}\n'.format(
+ args.genome_size, args.coverage)
+ # pbfalcon wants us to write errs here.
+ errfile = os.environ.get('PBFALCON_ERRFILE')
+ if errfile:
+ with open(errfile, 'w') as ofs:
+ ofs.write(msg)
+ raise Exception(msg)
sys.stdout.write(str(cutoff))
if __name__ == "__main__":
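
Aside: a tiny sketch (hypothetical usage, not from this commit) of how a caller could pick up the duplicated error text after a failed cutoff calculation, given the PBFALCON_ERRFILE behavior described above:

    import os
    errfile = os.environ.get('PBFALCON_ERRFILE')  # e.g. set by the calling workflow
    if errfile and os.path.exists(errfile):
        # same traceback plus the genome_size/coverage hint that went to stderr
        print open(errfile).read()
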
diff --git a/FALCON/falcon_kit/mains/dedup_a_tigs.py b/FALCON/falcon_kit/mains/dedup_a_tigs.py
index b8df550..0ed036a 100644
--- a/FALCON/falcon_kit/mains/dedup_a_tigs.py
+++ b/FALCON/falcon_kit/mains/dedup_a_tigs.py
@@ -1,4 +1,4 @@
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
import argparse
import sys
@@ -13,8 +13,8 @@ def parse_args(argv):
def main(argv=sys.argv):
args = parse_args(argv)
- reads = FastaReader("a_ctg_all.fa")
- with open("a_ctg.fa","w") as f:
+ with open_fasta_reader("a_ctg_all.fa") as reads:
+ with open("a_ctg.fa","w") as f:
for r in reads:
tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
if 100*float(idt) > args.max_idt and 100*float(cov) > args.max_aln_cov and\
diff --git a/FALCON/falcon_kit/mains/fetch_reads.py b/FALCON/falcon_kit/mains/fetch_reads.py
index 9f7b458..237c6a0 100644
--- a/FALCON/falcon_kit/mains/fetch_reads.py
+++ b/FALCON/falcon_kit/mains/fetch_reads.py
@@ -1,5 +1,6 @@
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
import argparse
+import contextlib
import os
import glob
import sys
@@ -25,9 +26,9 @@ def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
rid = int(fid.split('/')[1])/10
return rid_to_oid[int(rid)]
- ref_fasta = FastaReader(ctg_fa)
- all_ctg_ids = set()
- for s in ref_fasta:
+ with open_fasta_reader(ctg_fa) as ref_fasta:
+ all_ctg_ids = set()
+ for s in ref_fasta:
s_id = s.name.split()[0]
if ctg_id != 'all' and s_id != ctg_id:
continue
@@ -81,11 +82,22 @@ def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
print >>f, ctg_id
read_out_files = {}
+ @contextlib.contextmanager
+ def reopened_fasta_out(ctg_id):
+ # A convenient closure, with a contextmanager.
+ if ctg_id not in read_out_files:
+ read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'w' )
+ read_out_files[ctg_id] = 1
+ else:
+ read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'a' )
+ yield read_out
+ read_out.close()
+
with open(read_fofn, 'r') as f:
for r_fn in f:
r_fn = r_fn.strip()
- read_fa_file = FastaReader(r_fn)
- for r in read_fa_file:
+ with open_fasta_reader(r_fn) as read_fa_file: # will soon handle .dexta too
+ for r in read_fa_file:
rid = r.name.split()[0]
if rid not in read_set:
ctg_id = 'unassigned'
@@ -95,15 +107,9 @@ def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
if ctg_id == 'NA' or ctg_id not in all_ctg_ids:
ctg_id = 'unassigned'
- if ctg_id not in read_out_files:
- read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'w' )
- read_out_files[ctg_id] = 1
- else:
- read_out = open( os.path.join( out_dir, '%s_reads.fa' % ctg_id), 'a' )
-
- print >>read_out, '>'+rid
- print >>read_out, r.sequence
- read_out.close()
+ with reopened_fasta_out(ctg_id) as read_out:
+ print >>read_out, '>'+rid
+ print >>read_out, r.sequence
def parse_args(argv):
parser = argparse.ArgumentParser(description='using the read to contig mapping data to partition the reads grouped by contigs')
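
Aside: a self-contained sketch (illustrative, not from this commit) of the reopened-output pattern used by reopened_fasta_out above — the file is truncated the first time a contig is seen, appended to afterwards, and closed after each record; the path below is hypothetical:

    import contextlib

    seen = {}  # analogous to read_out_files above
    @contextlib.contextmanager
    def reopened(path):
        mode = 'a' if path in seen else 'w'
        seen[path] = 1
        f = open(path, mode)
        yield f
        f.close()

    with reopened('unassigned_reads.fa') as out:
        out.write('>some_read\nACGT\n')
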
diff --git a/FALCON/falcon_kit/mains/get_read_ctg_map.py b/FALCON/falcon_kit/mains/get_read_ctg_map.py
index 0ef0825..e96528f 100644
--- a/FALCON/falcon_kit/mains/get_read_ctg_map.py
+++ b/FALCON/falcon_kit/mains/get_read_ctg_map.py
@@ -1,8 +1,9 @@
from __future__ import absolute_import
+from .. import pype_tasks
+# pylint: disable=no-name-in-module, import-error, fixme, line-too-long
from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
makePypeLocalFile, fn, PypeTask)
PypeThreadTaskBase = MyFakePypeThreadTaskBase
-from .. import pype_tasks
import argparse
import glob
import logging
diff --git a/FALCON/falcon_kit/mains/graph_to_contig.py b/FALCON/falcon_kit/mains/graph_to_contig.py
index d004837..483709e 100644
--- a/FALCON/falcon_kit/mains/graph_to_contig.py
+++ b/FALCON/falcon_kit/mains/graph_to_contig.py
@@ -1,6 +1,6 @@
import networkx as nx
#from pbcore.io import FastaReader
-from falcon_kit.FastaReader import FastaReader
+from falcon_kit.FastaReader import open_fasta_reader
from falcon_kit import kup, falcon, DWA
read_fasta = "preads4falcon.fasta"
@@ -76,8 +76,8 @@ def main(argv=None):
seqs = {}
# load all p-read name into memory
- f = FastaReader(read_fasta)
- for r in f:
+ with open_fasta_reader(read_fasta) as f:
+ for r in f:
if r.name not in reads_in_layout:
continue
seqs[r.name] = r.sequence.upper()
diff --git a/FALCON/falcon_kit/mains/ovlp_filter.py b/FALCON/falcon_kit/mains/ovlp_filter.py
index 63d2de7..dd1a6ba 100644
--- a/FALCON/falcon_kit/mains/ovlp_filter.py
+++ b/FALCON/falcon_kit/mains/ovlp_filter.py
@@ -15,11 +15,9 @@ def filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len):
def ignore(overlap_data):
left_count = overlap_data["5p"]
right_count = overlap_data["3p"]
- if abs(left_count - right_count) > max_diff:
- return True
- elif left_count > max_ovlp or right_count > max_ovlp:
- return True
- elif left_count < min_ovlp or right_count < min_ovlp:
+ if (abs(left_count - right_count) > max_diff) or \
+ (left_count > max_ovlp) or (right_count > max_ovlp) or \
+ (left_count < min_ovlp) or (right_count < min_ovlp):
return True
ignore_rtn = []
diff --git a/FALCON/falcon_kit/mains/ovlp_to_graph.py b/FALCON/falcon_kit/mains/ovlp_to_graph.py
index 40dcd12..f8d6829 100644
--- a/FALCON/falcon_kit/mains/ovlp_to_graph.py
+++ b/FALCON/falcon_kit/mains/ovlp_to_graph.py
@@ -1,4 +1,3 @@
-#from pbcore.io import FastaReader
import networkx as nx
import os
import shlex
@@ -1043,7 +1042,7 @@ def identify_simple_paths(sg2, edge_data):
for v,w in free_edges:
if (reverse_end(w), reverse_end(v) ) not in free_edges:
print "bug", v,w
- print oreverse_end(w), reverse_end(v)
+ print reverse_end(w), reverse_end(v)
while free_edges:
if s_nodes:
diff --git a/FALCON/falcon_kit/mains/run1.py b/FALCON/falcon_kit/mains/run1.py
index 9b7bee6..247001a 100644
--- a/FALCON/falcon_kit/mains/run1.py
+++ b/FALCON/falcon_kit/mains/run1.py
@@ -1,6 +1,7 @@
from .. import run_support as support
from .. import bash, pype_tasks
-from ..util.system import only_these_symlinks
+from ..util.system import (only_these_symlinks, lfs_setstripe_maybe)
+# pylint: disable=no-name-in-module, import-error, fixme, line-too-long
from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
makePypeLocalFile, fn, PypeTask)
import argparse
@@ -13,7 +14,7 @@ import sys
import time
-fc_run_logger = logging.getLogger(__name__) # default, for remote tasks
+LOG = logging.getLogger(__name__) # default, for remote tasks
def create_daligner_tasks(basedir, scatter_fn):
@@ -111,14 +112,14 @@ def create_consensus_gather_task(wd, inputs):
def main1(prog_name, input_config_fn, logger_config_fn=None):
- global fc_run_logger
- fc_run_logger = support.setup_logger(logger_config_fn)
+ global LOG
+ LOG = support.setup_logger(logger_config_fn)
- fc_run_logger.info('fc_run started with configuration %s', input_config_fn)
+ LOG.info('fc_run started with configuration %s', input_config_fn)
try:
config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
except Exception:
- fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
+ LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
raise
input_fofn_plf = makePypeLocalFile(config['input_fofn'])
genome_size = config.get('genome_size')
@@ -143,7 +144,7 @@ def run(wf, config,
):
"""
Preconditions (for now):
- * fc_run_logger
+ * LOG
* run_support.logger
"""
rawread_dir = os.path.abspath('./0-rawreads')
@@ -156,20 +157,21 @@ def run(wf, config,
support.make_dirs(d)
exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs
- concurrent_jobs = config['pa_concurrent_jobs']
- wf.max_jobs = concurrent_jobs
+ wf.max_jobs = config['default_concurrent_jobs']
- rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn'])))
- make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
- outputs = {'o_fofn': rawread_fofn_plf},
- parameters = {},
- )
- fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)
-
- wf.addTasks([fofn_abs_task])
- wf.refreshTargets([fofn_abs_task])
+ assert config['input_type'] in ('raw', 'preads'), 'Invalid input_type=={!r}'.format(config['input_type'])
if config['input_type'] == 'raw':
+ rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn'])))
+ make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
+ outputs = {'o_fofn': rawread_fofn_plf},
+ parameters = {},
+ )
+ fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)
+
+ wf.addTasks([fofn_abs_task])
+ wf.refreshTargets([fofn_abs_task])
+
#### import sequences into daligner DB
sleep_done = makePypeLocalFile( os.path.join( rawread_dir, 'sleep_done') )
rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, 'rdb_build_done') )
@@ -196,6 +198,7 @@ def run(wf, config,
raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
#### run daligner
+ wf.max_jobs = config['da_concurrent_jobs']
scattered_plf = os.path.join(rawread_dir, 'daligner-scatter', 'scattered.json')
make_daligner_scatter = PypeTask(
inputs = {
@@ -234,6 +237,7 @@ def run(wf, config,
wf.refreshTargets(exitOnFailure=exitOnFailure)
# Merge .las files.
+ wf.max_jobs = config['la_concurrent_jobs']
scattered_plf = os.path.join(rawread_dir, 'merge-scatter', 'scattered.json')
make_task = PypeTask(
inputs = {
@@ -262,8 +266,7 @@ def run(wf, config,
sys.exit(0)
# Produce new FOFN of preads fasta, based on consensus of overlaps.
- concurrent_jobs = config['cns_concurrent_jobs']
- wf.max_jobs = concurrent_jobs
+ wf.max_jobs = config['cns_concurrent_jobs']
scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json')
make_task = PypeTask(
@@ -308,13 +311,13 @@ def run(wf, config,
if config['target'] == 'pre-assembly':
- log.info('Quitting after stage-0 for "pre-assembly" target.')
+ LOG.info('Quitting after stage-0 for "pre-assembly" target.')
sys.exit(0)
# build pread database
if config['input_type'] == 'preads':
preads_fofn_plf = makePypeLocalFile(os.path.join(pread_dir, 'preads-fofn-abs', os.path.basename(config['input_fofn'])))
- make_fofn_abs_task = PypeTask(inputs = {'i_fofn': rawread_fofn_plf},
+ make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
outputs = {'o_fofn': preads_fofn_plf},
parameters = {},
)
@@ -345,9 +348,7 @@ def run(wf, config,
preads_nblock = support.get_nblock(fn(preads_db))
#### run daligner
- concurrent_jobs = config['ovlp_concurrent_jobs']
- wf.max_jobs = concurrent_jobs
-
+ wf.max_jobs = config['pda_concurrent_jobs']
config['sge_option_da'] = config['sge_option_pda']
scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json')
@@ -387,6 +388,7 @@ def run(wf, config,
wf.refreshTargets(exitOnFailure=exitOnFailure)
# Merge .las files.
+ wf.max_jobs = config['pla_concurrent_jobs']
config['sge_option_la'] = config['sge_option_pla']
scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
make_task = PypeTask(
@@ -414,6 +416,8 @@ def run(wf, config,
wf.refreshTargets(exitOnFailure=exitOnFailure)
+ # Draft assembly (called 'fc_' for now)
+ wf.max_jobs = config['fc_concurrent_jobs']
db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
db2falcon_done = makePypeLocalFile(os.path.join(db2falcon_dir, 'db2falcon_done'))
preads4falcon_plf = makePypeLocalFile(os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
@@ -451,6 +455,7 @@ def run(wf, config,
def main(argv=sys.argv):
+ lfs_setstripe_maybe(path='.', stripe=12)
parser = argparse.ArgumentParser()
parser.add_argument('config',
help='.cfg/.ini/.json')
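For context, the per-stage concurrency change above follows one pattern: before each parallel phase, wf.max_jobs is reset from a stage-specific config key, falling back to default_concurrent_jobs. A minimal, hypothetical sketch of that pattern (the Workflow stand-in and the helper name are illustrations, not FALCON API):

    # Hypothetical sketch of the per-stage job-throttling pattern used above.
    class Workflow(object):
        def __init__(self):
            self.max_jobs = 1

    def set_stage_concurrency(wf, config, stage_key):
        # A stage-specific key wins; otherwise use the global default.
        wf.max_jobs = config.get(stage_key, config['default_concurrent_jobs'])
        return wf.max_jobs

    config = {'default_concurrent_jobs': 8, 'cns_concurrent_jobs': 32}
    wf = Workflow()
    assert set_stage_concurrency(wf, config, 'da_concurrent_jobs') == 8
    assert set_stage_concurrency(wf, config, 'cns_concurrent_jobs') == 32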
diff --git a/FALCON/falcon_kit/pype_tasks.py b/FALCON/falcon_kit/pype_tasks.py
index 4219388..bfc6f8b 100644
--- a/FALCON/falcon_kit/pype_tasks.py
+++ b/FALCON/falcon_kit/pype_tasks.py
@@ -201,7 +201,9 @@ def task_run_las_merge(self):
else:
src = os.path.relpath(las_path, cwd)
tgt = os.path.join(cwd, os.path.basename(las_path))
- LOG.debug('symlink {!r} -> {!r}'.format(src, tgt))
+ LOG.debug('symlink {!r} <- {!r}'.format(src, tgt))
+ if os.path.lexists(tgt):
+ os.unlink(tgt)
os.symlink(src, tgt)
config = self.parameters['config']
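The symlink fix above makes task re-runs idempotent: any stale link (even a dangling one) is removed before re-linking. As a standalone sketch, not FALCON code:

    import os

    def force_symlink(src, tgt):
        # os.path.lexists() is true for broken links too, so a stale
        # symlink left by a previous run never causes EEXIST here.
        if os.path.lexists(tgt):
            os.unlink(tgt)
        os.symlink(src, tgt)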
diff --git a/FALCON/falcon_kit/run_support.py b/FALCON/falcon_kit/run_support.py
index a41f124..2e21fcd 100644
--- a/FALCON/falcon_kit/run_support.py
+++ b/FALCON/falcon_kit/run_support.py
@@ -68,7 +68,7 @@ def make_job_data(url, script_fn):
Base job_name on script_fn.
"""
wd = os.path.dirname(script_fn)
- job_name = '{0}-{1}-{1}'.format(
+ job_name = '{0}-{1}-{2}'.format(
os.path.basename(script_fn),
url.split("/")[-1],
str(uuid.uuid4())[:8],
@@ -131,6 +131,7 @@ def parse_config(config_fn):
config.readfp(open(config_fn))
return config
+import warnings
def get_dict_from_old_falcon_cfg(config):
job_type = "SGE"
section = 'General'
@@ -143,7 +144,7 @@ def get_dict_from_old_falcon_cfg(config):
else:
sge_option = config.get(section, 'sge_option_da')
- job_queue = "default"
+ job_queue = ""
if config.has_option(section, 'job_queue'):
job_queue = config.get(section, 'job_queue')
@@ -159,17 +160,33 @@ def get_dict_from_old_falcon_cfg(config):
if config.has_option(section, 'pwatcher_directory'):
pwatcher_directory = config.get(section, 'pwatcher_directory')
- pa_concurrent_jobs = default_concurrent_jobs
- if config.has_option(section, 'pa_concurrent_jobs'):
- pa_concurrent_jobs = config.getint(section, 'pa_concurrent_jobs')
-
+ da_concurrent_jobs = default_concurrent_jobs
+ la_concurrent_jobs = default_concurrent_jobs
cns_concurrent_jobs = default_concurrent_jobs
- if config.has_option(section, 'cns_concurrent_jobs'):
- cns_concurrent_jobs = config.getint(section, 'cns_concurrent_jobs')
+ pda_concurrent_jobs = default_concurrent_jobs
+ pla_concurrent_jobs = default_concurrent_jobs
+ fc_concurrent_jobs = default_concurrent_jobs
- ovlp_concurrent_jobs = default_concurrent_jobs
+ if config.has_option(section, 'pa_concurrent_jobs'):
+ pa_concurrent_jobs = config.getint(section, 'pa_concurrent_jobs')
+ warnings.warn("Deprecated setting in config: 'pa_concurrent_jobs' -- Prefer da_concurrent_jobs and la_concurrent_jobs separately")
+ da_concurrent_jobs = la_concurrent_jobs = pa_concurrent_jobs
if config.has_option(section, 'ovlp_concurrent_jobs'):
ovlp_concurrent_jobs = config.getint(section, 'ovlp_concurrent_jobs')
+ warnings.warn("Deprecated setting in config: 'ovlp_concurrent_jobs' -- Prefer pda_concurrent_jobs and pla_concurrent_jobs separately")
+ pda_concurrent_jobs = pla_concurrent_jobs = ovlp_concurrent_jobs
+ if config.has_option(section, 'da_concurrent_jobs'):
+ da_concurrent_jobs = config.getint(section, 'da_concurrent_jobs')
+ if config.has_option(section, 'la_concurrent_jobs'):
+ la_concurrent_jobs = config.getint(section, 'la_concurrent_jobs')
+ if config.has_option(section, 'cns_concurrent_jobs'):
+ cns_concurrent_jobs = config.getint(section, 'cns_concurrent_jobs')
+ if config.has_option(section, 'pda_concurrent_jobs'):
+ pda_concurrent_jobs = config.getint(section, 'pda_concurrent_jobs')
+ if config.has_option(section, 'pla_concurrent_jobs'):
+ pla_concurrent_jobs = config.getint(section, 'pla_concurrent_jobs')
+ if config.has_option(section, 'fc_concurrent_jobs'):
+ fc_concurrent_jobs = config.getint(section, 'fc_concurrent_jobs')
#appending = False
#if config.has_option(section, 'appending'):
@@ -245,6 +262,10 @@ def get_dict_from_old_falcon_cfg(config):
if config.has_option(section, 'falcon_sense_greedy'):
falcon_sense_greedy = config.getboolean(section, 'falcon_sense_greedy')
+ LA4Falcon_preload = ""
+ if config.has_option(section, 'la4falcon_preload'):
+ LA4Falcon_preload = config.getboolean(section, 'la4falcon_preload')
+
genome_size = 0
if config.has_option(section, 'genome_size'):
genome_size = config.getint(section, 'genome_size')
@@ -310,9 +331,13 @@ def get_dict_from_old_falcon_cfg(config):
"job_queue" : job_queue,
"input_type": input_type,
#"openending": openending,
- "pa_concurrent_jobs" : pa_concurrent_jobs,
- "ovlp_concurrent_jobs" : ovlp_concurrent_jobs,
+ "default_concurrent_jobs" : default_concurrent_jobs,
+ "da_concurrent_jobs" : da_concurrent_jobs,
+ "la_concurrent_jobs" : la_concurrent_jobs,
"cns_concurrent_jobs" : cns_concurrent_jobs,
+ "pda_concurrent_jobs" : pda_concurrent_jobs,
+ "pla_concurrent_jobs" : pla_concurrent_jobs,
+ "fc_concurrent_jobs" : fc_concurrent_jobs,
"overlap_filtering_setting": overlap_filtering_setting,
"genome_size" : genome_size,
"seed_coverage" : seed_coverage,
@@ -338,6 +363,7 @@ def get_dict_from_old_falcon_cfg(config):
"falcon_sense_option": falcon_sense_option,
"falcon_sense_skip_contained": falcon_sense_skip_contained,
"falcon_sense_greedy": falcon_sense_greedy,
+ "LA4Falcon_preload": LA4Falcon_preload,
"stop_all_jobs_on_failure": stop_all_jobs_on_failure,
"use_tmpdir": use_tmpdir,
"pwatcher_type": pwatcher_type,
@@ -347,7 +373,6 @@ def get_dict_from_old_falcon_cfg(config):
provided = dict(config.items(section))
unused = set(provided) - set(k.lower() for k in hgap_config)
if unused:
- import warnings
warnings.warn("Unexpected keys in input config: %s" %repr(unused))
hgap_config["install_prefix"] = sys.prefix
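The run_support.py hunk above keeps the old umbrella settings working while preferring the new per-stage keys: a deprecated key seeds both of its successors, and an explicit per-stage key then overrides it. A small sketch under that assumption (plain-dict config and a hypothetical helper name, not the actual parser):

    import warnings

    def concurrency_from_config(cfg, default):
        da = la = default
        if 'pa_concurrent_jobs' in cfg:          # deprecated umbrella key
            warnings.warn("pa_concurrent_jobs is deprecated; "
                          "prefer da_concurrent_jobs and la_concurrent_jobs")
            da = la = int(cfg['pa_concurrent_jobs'])
        da = int(cfg.get('da_concurrent_jobs', da))   # specific keys win
        la = int(cfg.get('la_concurrent_jobs', la))
        return da, la

    assert concurrency_from_config({}, 8) == (8, 8)
    assert concurrency_from_config({'pa_concurrent_jobs': 4}, 8) == (4, 4)
    assert concurrency_from_config({'pa_concurrent_jobs': 4,
                                    'la_concurrent_jobs': 2}, 8) == (4, 2)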
diff --git a/FALCON/falcon_kit/stats_preassembly.py b/FALCON/falcon_kit/stats_preassembly.py
index f322b7f..623378a 100644
--- a/FALCON/falcon_kit/stats_preassembly.py
+++ b/FALCON/falcon_kit/stats_preassembly.py
@@ -7,7 +7,7 @@ See FALCON-pbsmrtpipe/pbfalcon/report_preassembly.py for XML version.
# http://swarm/files/depot/branches/springfield/S2.3/software/smrtanalysis/bioinformatics/tools/pbreports/pbreports/report/preassembly.py
from __future__ import absolute_import
from __future__ import division
-from .FastaReader import FastaReader
+from .FastaReader import open_fasta_reader
from .util.io import syscall
from . import functional
import collections
@@ -21,7 +21,7 @@ import re
log = logging.getLogger(__name__)
__version__ = '0.1'
-Stats = collections.namedtuple('FastaStats', ['nreads', 'total', 'n50', 'p95'])
+Stats = collections.namedtuple('FastaStats', ['nreads', 'total', 'n50', 'p95', 'esize'])
# Copied from pbreports/util.py
# We want to avoid a dependency on pbreports b/c it needs matplotlib.
@@ -31,7 +31,7 @@ def get_fasta_readlengths(fasta_file):
:return: (tuple)
"""
lens = []
- with FastaReader(fasta_file) as f:
+ with open_fasta_reader(fasta_file) as f:
for record in f:
lens.append(len(record.sequence))
lens.sort()
@@ -88,11 +88,13 @@ def percentile(read_lens, p):
def stats_from_sorted_readlengths(read_lens):
nreads = len(read_lens)
total = sum(read_lens)
+ sum_squares = sum(r*r for r in read_lens)
n50 = read_len_above(read_lens, int(total * 0.50))
p95 = percentile(read_lens, 0.95)
+ esize = sum_squares / total
#alt_n50 = pbreports.util.compute_n50(read_lens)
#log.info('our n50=%s, pbreports=%s' %(n50, alt_n50)) # Ours is more correct when median is between 2 reads.
- return Stats(nreads=nreads, total=total, n50=n50, p95=p95)
+ return Stats(nreads=nreads, total=total, n50=n50, p95=p95, esize=esize)
def read_lens_from_fofn(fofn_fn):
"""Return sorted list.
@@ -152,18 +154,21 @@ def stats_dict(stats_raw_reads, stats_seed_reads, stats_corrected_reads, genome_
kwds['raw_n50'] = stats_raw_reads.n50
kwds['raw_p95'] = stats_raw_reads.p95
kwds['raw_coverage'] = stats_raw_reads.total / genome_length
+ kwds['raw_esize'] = stats_raw_reads.esize
kwds['seed_reads'] = stats_seed_reads.nreads
kwds['seed_bases'] = stats_seed_reads.total
kwds['seed_mean'] = stats_seed_reads.total / stats_seed_reads.nreads
kwds['seed_n50'] = stats_seed_reads.n50
kwds['seed_p95'] = stats_seed_reads.p95
kwds['seed_coverage'] = stats_seed_reads.total / genome_length
+ kwds['seed_esize'] = stats_seed_reads.esize
kwds['preassembled_reads'] = stats_corrected_reads.nreads
kwds['preassembled_bases'] = stats_corrected_reads.total
kwds['preassembled_mean'] = stats_corrected_reads.total / stats_corrected_reads.nreads
kwds['preassembled_n50'] = stats_corrected_reads.n50
kwds['preassembled_p95'] = stats_corrected_reads.p95
kwds['preassembled_coverage'] = stats_corrected_reads.total / genome_length
+ kwds['preassembled_esize'] = stats_corrected_reads.esize
kwds['preassembled_yield'] = stats_corrected_reads.total / stats_seed_reads.total
kwds['preassembled_seed_fragmentation'] = fragmentation
kwds['preassembled_seed_truncation'] = truncation
@@ -178,6 +183,8 @@ def make_dict(
i_raw_reads_fofn_fn,
genome_length,
length_cutoff,
+ fragmentation=-1,
+ truncation=-1,
):
raw_reads = read_lens_from_fofn(i_raw_reads_fofn_fn)
stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
@@ -193,6 +200,8 @@ def make_dict(
stats_corrected_reads=stats_preads,
genome_length=genome_length,
length_cutoff=length_cutoff,
+ fragmentation=fragmentation,
+ truncation=truncation,
)
return report_dict
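The new esize field added above is the expected read length when a base is drawn uniformly at random, i.e. the sum of squared read lengths divided by the total bases. A standalone sketch of the same computation:

    def expected_size(read_lens):
        # E-size: length of the read containing a randomly chosen base,
        # sum(L_i^2) / sum(L_i).
        total = sum(read_lens)
        return sum(L * L for L in read_lens) / float(total)

    assert expected_size([10, 10]) == 10.0
    assert abs(expected_size([1, 9]) - 8.2) < 1e-9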
diff --git a/FALCON/falcon_kit/util/system.py b/FALCON/falcon_kit/util/system.py
index 5748a24..2f029d6 100644
--- a/FALCON/falcon_kit/util/system.py
+++ b/FALCON/falcon_kit/util/system.py
@@ -1,3 +1,4 @@
+from .io import system
import logging
import os
import pprint
@@ -30,3 +31,11 @@ def only_these_symlinks(dir2paths):
for base, rel in base2rel.iteritems():
path = os.path.join(d, base)
os.symlink(rel, path)
+
+def lfs_setstripe_maybe(path='.', stripe=12):
+ path = os.path.abspath(path)
+ rc = system('lfs setstripe -c {:d} {!s}'.format(stripe, path))
+ if rc:
+ log.info('Apparently {!r} is not on a Lustre filesystem.'.format(path))
+ else:
+ log.info('This lfs stripe ({}) should propagate to subdirs of {!r}.'.format(stripe, path))
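The lfs_setstripe_maybe() helper above is best-effort: if 'lfs setstripe' fails (non-Lustre filesystem, or no lfs binary at all), the run logs a note and continues. A roughly equivalent standalone sketch using subprocess in place of the in-package system() helper:

    import logging
    import subprocess

    log = logging.getLogger(__name__)

    def lfs_setstripe_maybe(path='.', stripe=12):
        try:
            rc = subprocess.call(['lfs', 'setstripe', '-c', str(stripe), path])
        except OSError:
            rc = 1  # 'lfs' is not installed
        if rc:
            log.info('Apparently %r is not on a Lustre filesystem.', path)
        else:
            log.info('This lfs stripe (%d) should propagate to subdirs of %r.',
                     stripe, path)
        return rc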
diff --git a/FALCON/makefile b/FALCON/makefile
new file mode 100644
index 0000000..8b138c4
--- /dev/null
+++ b/FALCON/makefile
@@ -0,0 +1,65 @@
+# Feel free to override this.
+ifndef PYTHONUSERBASE
+ PYTHONUSERBASE:=LOCAL
+ PATH:=${PYTHONUSERBASE}/bin:${PATH}
+ export PYTHONUSERBASE
+ export PATH
+endif
+export COVERAGE_PROCESS_START
+
+MY_TEST_FLAGS?=-v -s
+
+install-edit:
+ pip -v install --user --edit .
+install: wheel
+ pip -v install --user --use-wheel --find-links=dist/ .
+pylint:
+ pylint --errors-only falcon_kit/
+test:
+ python -c 'import falcon_kit; print falcon_kit.falcon'
+ #pip install --user pytest
+ py.test ${MY_TEST_FLAGS} --junit-xml=test.basic.xml test/
+ py.test ${MY_TEST_FLAGS} --junit-xml=test.doctest.xml --doctest-modules falcon_kit/functional.py
+ cp -f test.basic.xml nose.basic.xml
+ cp -f test.doctest.xml nose.doctest.xml
+coverage:
+ make coverage-clean
+ #pip install --user coverage
+ COVERAGE_PROCESS_START=${PWD}/mycoverage.cfg ${MAKE} coverage-actual
+coverage-actual: test
+ ls -larth
+ coverage combine
+ ls -larth
+ coverage xml -o coverage.xml
+ sed -i -e 's@filename="@filename="./@g' coverage.xml
+ coverage report -m
+coverage-clean:
+ rm -f .coverage* coverage.xml
+coverage-install:
+ # This is needed only if you run from a different directory, since ./sitecustomize.py
+ # would not be in 'sys.path'.
+ # Assume PYTHONUSERBASE is set.
+ mkdir -p ${PYTHONUSERBASE}/lib/python2.7/site-packages
+ ln -f mysitecustomize.py ${PYTHONUSERBASE}/lib/python2.7/site-packages/sitecustomize.py
+coverage-uninstall:
+ rm -f ${PYTHONUSERBASE}/lib/python2.7/site-packages/sitecustomize.py*
+
+# We cannot run doctests on *all* modules because some include dependencies.
+# Just pypeFLOW for now, but I would rather not test dependencies anyway.
+
+wheel:
+ pip install --upgrade --user pip
+ python setup.py bdist_wheel
+# Look for dist/*.whl
+
+tar:
+ rm -f FALCON.tar.gz
+ tar cvzf FALCON.tar.gz -C ${PYTHONUSERBASE} .
+# Much smaller than the wheel, and includes all necessary dependencies,
+# but also includes anything already in the user-site.
+
+clean: coverage-clean
+ \rm -f *.xml
+
+
+.PHONY: install test install-edit wheel coverage tar clean
diff --git a/FALCON/mycoverage.cfg b/FALCON/mycoverage.cfg
new file mode 100644
index 0000000..9de3ac8
--- /dev/null
+++ b/FALCON/mycoverage.cfg
@@ -0,0 +1,5 @@
+[run]
+branch = True
+data_file = ${PWD}/.coverage
+parallel = True
+source = falcon_kit
diff --git a/FALCON/mysitecustomize.py b/FALCON/mysitecustomize.py
new file mode 100644
index 0000000..e060ab1
--- /dev/null
+++ b/FALCON/mysitecustomize.py
@@ -0,0 +1,3 @@
+#import site; site.addsitedir('...')
+#raise Exception('WHERE')
+import coverage; coverage.process_startup()
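For reference, coverage.process_startup() is a no-op unless the COVERAGE_PROCESS_START environment variable names a coverage config (here mycoverage.cfg, exported by the 'coverage' target in FALCON/makefile), so importing it from sitecustomize.py is harmless for ordinary runs. A slightly more defensive variant of the same hook, as a sketch:

    import os

    if os.environ.get('COVERAGE_PROCESS_START'):
        # Only measure subprocess coverage when explicitly requested.
        import coverage
        coverage.process_startup()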
diff --git a/FALCON/setup.py b/FALCON/setup.py
index b6f893c..d189809 100755
--- a/FALCON/setup.py
+++ b/FALCON/setup.py
@@ -8,6 +8,7 @@ install_requires=[
"networkx >=1.7, <=1.10",
#"logging_tree",
#"pbcore >= 0.6.3",
+ #"pypeFLOW", # We exclude pypeFLOW because it is not needed for the unit-tests.
]
scripts = glob.glob("src/py_scripts/*.py")
diff --git a/FALCON/src/c/falcon.c b/FALCON/src/c/falcon.c
index 11740a1..ce679ac 100755
--- a/FALCON/src/c/falcon.c
+++ b/FALCON/src/c/falcon.c
@@ -147,6 +147,8 @@ align_tags_t * get_align_tags( char * aln_q_seq,
p_j = j;
p_jj = jj;
p_q_base = aln_q_seq[k];
+ } else {
+ break; // when there is a big alignment gap > UINT8_MAX, stop extending the tagging string
}
}
// sentinel at the end
@@ -316,13 +318,13 @@ consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
consensus_data * consensus;
//char * consensus;
align_tag_t * c_tag;
- static msa_pos_t * msa_array = NULL;
coverage = calloc( t_len, sizeof(unsigned int) );
local_nbase = calloc( t_len, sizeof(unsigned int) );
#ifndef STATIC_ALLOCATE
+ msa_pos_t * msa_array = NULL; // For more efficiency, this should be injected.
msa_array = calloc(t_len, sizeof(msa_pos_t *));
for (i = 0; i < t_len; i++) {
@@ -331,10 +333,9 @@ consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
allocate_delta_group(msa_array[i]);
}
-#endif
-
-#ifdef STATIC_ALLOCATE
+#else
+ static msa_pos_t * msa_array = NULL;
if ( msa_array == NULL) {
msa_array = get_msa_working_sapce( 100000 );
}
@@ -547,9 +548,7 @@ consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
}
free(msa_array);
-#endif
-
-#ifdef STATIC_ALLOCATE
+#else
clean_msa_working_space(msa_array, t_len+1);
#endif
diff --git a/FALCON/src/py_scripts_v0.1/falcon_asm.py b/FALCON/src/py_scripts_v0.1/falcon_asm.py
deleted file mode 100755
index 0b632e8..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_asm.py
+++ /dev/null
@@ -1,1154 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from pbcore.io import FastaReader
-import networkx as nx
-import os
-import shlex
-import sys
-import subprocess
-
-DEBUG_LOG_LEVEL = 0
-
-class SGNode(object):
- """
- class representing a node in the string graph
- """
- def __init__(self, node_name):
- self.name = node_name
- self.out_edges = []
- self.in_edges = []
- def add_out_edge(self, out_edge):
- self.out_edges.append(out_edge)
- def add_in_edge(self, in_edge):
- self.in_edges.append(in_edge)
-
-class SGEdge(object):
- """
- class representing an edge in the string graph
- """
- def __init__(self, in_node, out_node):
- self.in_node = in_node
- self.out_node = out_node
- self.attr = {}
- def set_attribute(self, attr, value):
- self.attr[attr] = value
-
-def reverse_end( node_id ):
- node_id, end = node_id.split(":")
- new_end = "B" if end == "E" else "E"
- return node_id + ":" + new_end
-
-class StringGraph(object):
- """
- class representing the string graph
- """
- def __init__(self):
- self.nodes = {}
- self.edges = {}
- self.n_mark = {}
- self.e_reduce = {}
- self.repeat_overlap = {}
-
- def add_node(self, node_name):
- """
- add a node into the graph by given a node name
- """
- if node_name not in self.nodes:
- self.nodes[node_name] = SGNode(node_name)
-
- def add_edge(self, in_node_name, out_node_name, **attributes):
- """
- add an edge into the graph by given a pair of nodes
- """
- if (in_node_name, out_node_name) not in self.edges:
-
- self.add_node(in_node_name)
- self.add_node(out_node_name)
- in_node = self.nodes[in_node_name]
- out_node = self.nodes[out_node_name]
-
- edge = SGEdge(in_node, out_node)
- self.edges[ (in_node_name, out_node_name) ] = edge
- in_node.add_out_edge(edge)
- out_node.add_in_edge(edge)
- edge = self.edges[ (in_node_name, out_node_name) ]
- for k, v in attributes.items():
- edge.attr[k] = v
-
- def init_reduce_dict(self):
- for e in self.edges:
- self.e_reduce[e] = False
-
- def mark_chimer_edge(self):
-
- for e_n, e in self.edges.items():
- v = e_n[0]
- w = e_n[1]
- overlap_count = 0
- for w_out_e in self.nodes[w].out_edges:
- w_out_n = w_out_e.out_node.name
- if (v, w_out_n) in self.edges:
- overlap_count += 1
- for v_in_e in self.nodes[v].in_edges:
- v_in_n = v_in_e.in_node.name
- if (v_in_n, w) in self.edges:
- overlap_count += 1
- if self.e_reduce[ (v, w) ] != True:
- if overlap_count == 0:
- self.e_reduce[(v, w)] = True
- #print "XXX: chimer edge %s %s removed" % (v, w)
- v, w = reverse_end(w), reverse_end(v)
- self.e_reduce[(v, w)] = True
- #print "XXX: chimer edge %s %s removed" % (v, w)
-
-
-
- def mark_spur_edge(self):
-
- for v in self.nodes:
- if len(self.nodes[v].out_edges) > 1:
- for out_edge in self.nodes[v].out_edges:
- w = out_edge.out_node.name
-
- if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
- #print "XXX: spur edge %s %s removed" % (v, w)
- self.e_reduce[(v, w)] = True
- v2, w2 = reverse_end(w), reverse_end(v)
- #print "XXX: spur edge %s %s removed" % (v2, w2)
- self.e_reduce[(v, w)] = True
-
- if len(self.nodes[v].in_edges) > 1:
- for in_edge in self.nodes[v].in_edges:
- w = in_edge.in_node.name
- if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
- #print "XXX: spur edge %s %s removed" % (w, v)
- self.e_reduce[(w, v)] = True
- v2, w2 = reverse_end(w), reverse_end(v)
- #print "XXX: spur edge %s %s removed" % (w2, v2)
- self.e_reduce[(w, v)] = True
-
-
- def mark_tr_edges(self):
- """
- transitive reduction
- """
- n_mark = self.n_mark
- e_reduce = self.e_reduce
- FUZZ = 500
- for n in self.nodes:
- n_mark[n] = "vacant"
-
- for n_name, node in self.nodes.items():
-
- out_edges = node.out_edges
- if len(out_edges) == 0:
- continue
-
- out_edges.sort(key=lambda x: x.attr["length"])
-
- for e in out_edges:
- w = e.out_node
- n_mark[ w.name ] = "inplay"
-
- max_len = out_edges[-1].attr["length"]
-
- max_len += FUZZ
-
- for e in out_edges:
- e_len = e.attr["length"]
- w = e.out_node
- if n_mark[w.name] == "inplay":
- w.out_edges.sort( key=lambda x: x.attr["length"] )
- for e2 in w.out_edges:
- if e2.attr["length"] + e_len < max_len:
- x = e2.out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
-
- for e in out_edges:
- e_len = e.attr["length"]
- w = e.out_node
- w.out_edges.sort( key=lambda x: x.attr["length"] )
- if len(w.out_edges) > 0:
- x = w.out_edges[0].out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
- for e2 in w.out_edges:
- if e2.attr["length"] < FUZZ:
- x = e2.out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
-
- for out_edge in out_edges:
- v = out_edge.in_node
- w = out_edge.out_node
- if n_mark[w.name] == "eliminated":
- e_reduce[ (v.name, w.name) ] = True
- #print "XXX: tr edge %s %s removed" % (v.name, w.name)
- v_name, w_name = reverse_end(w.name), reverse_end(v.name)
- e_reduce[(v_name, w_name)] = True
- #print "XXX: tr edge %s %s removed" % (v_name, w_name)
- n_mark[w.name] = "vacant"
-
-
- def mark_best_overlap(self):
- """
- find the best overlapped edges
- """
-
- best_edges = set()
-
- for v in self.nodes:
-
- out_edges = self.nodes[v].out_edges
- if len(out_edges) > 0:
- out_edges.sort(key=lambda e: e.attr["score"])
- e = out_edges[-1]
- best_edges.add( (e.in_node.name, e.out_node.name) )
-
- in_edges = self.nodes[v].in_edges
- if len(in_edges) > 0:
- in_edges.sort(key=lambda e: e.attr["score"])
- e = in_edges[-1]
- best_edges.add( (e.in_node.name, e.out_node.name) )
-
- if DEBUG_LOG_LEVEL > 1:
- print "X", len(best_edges)
-
- for e_n, e in self.edges.items():
- v = e_n[0]
- w = e_n[1]
- if self.e_reduce[ (v, w) ] != True:
- if (v, w) not in best_edges:
- self.e_reduce[(v, w)] = True
- #print "XXX: in best edge %s %s removed" % (v, w)
- v2, w2 = reverse_end(w), reverse_end(v)
- #print "XXX: in best edge %s %s removed" % (v2, w2)
- self.e_reduce[(v2, w2)] = True
-
- def get_out_edges_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].out_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- return rtn
-
-
- def get_in_edges_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].in_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- return rtn
-
- def get_best_out_edge_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].out_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- rtn.sort(key=lambda e: e.attr["score"])
-
- return rtn[-1]
-
- def get_best_in_edge_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].in_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- rtn.sort(key=lambda e: e.attr["score"])
- return rtn[-1]
-
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-def generate_seq_from_path(sg, seqs, path):
- subseqs = []
- r_id, end = path[0].split(":")
-
- count = 0
- for i in range( len( path ) -1 ):
- w_n, v_n = path[i:i+2]
- edge = sg.edges[ (w_n, v_n ) ]
- read_id, coor = edge.attr["label"].split(":")
- b,e = coor.split("-")
- b = int(b)
- e = int(e)
- if b < e:
- subseqs.append( seqs[read_id][b:e] )
- else:
- subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
-
- return "".join(subseqs)
-
-
-def reverse_path( path ):
- new_path = []
- for n in list(path[::-1]):
- rid, end = n.split(":")
- new_end = "B" if end == "E" else "E"
- new_path.append( rid+":"+new_end)
- return new_path
-
-
-def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
-
- """
- given a string graph:sg and the sequences: seqs, write the unitig fasta file into out_fn
- the funtion return a reduct graph representing the reduce string graph where the edges are unitigs
-
- some extra files generated:
- unit_edges.dat : an easy to parse file for unitig data
- unit_edge_paths : the file contains the information of the path of all unitigs
- uni_graph.gexf: the unitig graph in gexf format for visulization
- """
-
- G = SGToNXG(sg)
- if connected_nodes != None:
- connected_nodes = set(sg.nodes)
- out_fasta = open(out_fn, "w")
- nodes_for_tig = set()
- sg_edges = set()
- for v, w in sg.edges:
- if sg.e_reduce[(v, w)] != True:
- sg_edges.add( (v, w) )
- count = 0
- edges_in_tigs = set()
-
- uni_edges = {}
- path_f = open("unit_edge_paths","w")
- uni_edge_f = open("unit_edges.dat", "w")
- while len(sg_edges) > 0:
- v, w = sg_edges.pop()
-
- #nodes_for_tig.remove(n)
- upstream_nodes = []
-
- c_node = v
- p_in_edges = sg.get_in_edges_for_node(c_node)
- p_out_edges = sg.get_out_edges_for_node(c_node)
- while len(p_in_edges) == 1 and len(p_out_edges) == 1:
- p_node = p_in_edges[0].in_node
- upstream_nodes.append(p_node.name)
- if (p_node.name, c_node) not in sg_edges:
- break
- p_in_edges = sg.get_in_edges_for_node(p_node.name)
- p_out_edges = sg.get_out_edges_for_node(p_node.name)
- c_node = p_node.name
-
- upstream_nodes.reverse()
-
- downstream_nodes = []
- c_node = w
- n_out_edges = sg.get_out_edges_for_node(c_node)
- n_in_edges = sg.get_in_edges_for_node(c_node)
- while len(n_out_edges) == 1 and len(n_in_edges) == 1:
- n_node = n_out_edges[0].out_node
- downstream_nodes.append(n_node.name)
- if (c_node, n_node.name) not in sg_edges:
- break
- n_out_edges = sg.get_out_edges_for_node(n_node.name)
- n_in_edges = sg.get_in_edges_for_node(n_node.name)
- c_node = n_node.name
-
- whole_path = upstream_nodes + [v, w] + downstream_nodes
- count += 1
- subseq = generate_seq_from_path(sg, seqs, whole_path)
- uni_edges.setdefault( (whole_path[0], whole_path[-1]), [] )
- uni_edges[(whole_path[0], whole_path[-1])].append( ( whole_path, subseq ) )
- print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), subseq
- print >>path_f, ">%05dc-%s-%s-%d %s" % (count, whole_path[0], whole_path[-1], len(whole_path), " ".join(whole_path))
- print >>out_fasta, ">%05dc-%s-%s-%d" % (count, whole_path[0], whole_path[-1], len(whole_path))
- print >>out_fasta, subseq
- for i in range( len( whole_path ) -1 ):
- w_n, v_n = whole_path[i:i+2]
- try:
- sg_edges.remove( (w_n, v_n) )
- except KeyError: #if an edge is already deleted, ignore it
- pass
-
- r_whole_path = reverse_path( whole_path )
- count += 1
- subseq = generate_seq_from_path(sg, seqs, r_whole_path)
- uni_edges.setdefault( (r_whole_path[0], r_whole_path[-1]), [] )
- uni_edges[(r_whole_path[0], r_whole_path[-1])].append( ( r_whole_path, subseq ) )
- print >> uni_edge_f, r_whole_path[0], r_whole_path[-1], "-".join(r_whole_path), subseq
- print >>path_f, ">%05dc-%s-%s-%d %s" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path), " ".join(r_whole_path))
- print >>out_fasta, ">%05dc-%s-%s-%d" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path))
- print >>out_fasta, subseq
- for i in range( len( r_whole_path ) -1 ):
- w_n, v_n = r_whole_path[i:i+2]
- try:
- sg_edges.remove( (w_n, v_n) )
- except KeyError: #if an edge is already deleted, ignore it
- pass
-
-
- path_f.close()
- uni_edge_f.close()
- #uni_graph = nx.DiGraph()
- #for n1, n2 in uni_edges.keys():
- # uni_graph.add_edge(n1, n2, count = len( uni_edges[ (n1,n2) ] ))
- #nx.write_gexf(uni_graph, "uni_graph.gexf")
-
- out_fasta.close()
- return uni_edges
-
-def neighbor_bound(G, v, w, radius):
- """
- test if the node v and the node w are connected within a radius in graph G
- """
- g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
- g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
- if len(set(g1.edges()) & set(g2.edges())) > 0:
- return True
- else:
- return False
-
-
-def is_branch_node(G, n):
- """
- test whether the node n is a "branch node" which the paths from any of two of
- its offsprings do not intersect within a given radius
- """
- out_edges = G.out_edges([n])
- n2 = [ e[1] for e in out_edges ]
- is_branch = False
- for i in range(len(n2)):
- for j in range(i+1, len(n2)):
- v = n2[i]
- w = n2[j]
- if neighbor_bound(G, v, w, 10) == False:
- is_branch = True
- break
- if is_branch == True:
- break
- return is_branch
-
-
-def get_bundle( path, u_graph ):
-
- """
- find a sub-graph contain the nodes between the start and the end of the path
- inputs:
- u_graph : a unitig graph
- returns:
- bundle_graph: the whole bundle graph
- bundle_paths: the paths in the bundle graph
- sub_graph2_edges: all edges of the bundle graph
-
- """
-
- p_start, p_end = path[0], path[-1]
- p_nodes = set(path)
- p_edges = set(zip(path[:-1], path[1:]))
-
- u_graph_r = u_graph.reverse()
- down_path = nx.ego_graph(u_graph, p_start, radius=len(p_nodes), undirected=False)
- up_path = nx.ego_graph(u_graph_r, p_end, radius=len(p_nodes), undirected=False)
- subgraph_nodes = set(down_path) & set(up_path)
-
-
- sub_graph = nx.DiGraph()
- for v, w in u_graph.edges_iter():
- if v in subgraph_nodes and w in subgraph_nodes:
- if (v, w) in p_edges:
- sub_graph.add_edge(v, w, color = "red")
- else:
- sub_graph.add_edge(v, w, color = "black")
-
- sub_graph2 = nx.DiGraph()
- tips = set()
- tips.add(path[0])
- sub_graph_r = sub_graph.reverse()
- visited = set()
- ct = 0
- is_branch = is_branch_node(sub_graph, path[0]) #if the start node is a branch node
- if is_branch:
- n = tips.pop()
- e = sub_graph.out_edges([n])[0] #pick one path the build the subgraph
- sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
- if e[1] not in visited:
- last_node = e[1]
- visited.add(e[1])
- r_id, orientation = e[1].split(":")
- orientation = "E" if orientation == "B" else "E"
- visited.add( r_id +":" + orientation)
- if not is_branch_node(sub_graph_r, e[1]):
- tips.add(e[1])
-
- while len(tips) != 0:
- n = tips.pop()
- out_edges = sub_graph.out_edges([n])
- if len(out_edges) == 1:
- e = out_edges[0]
- sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
- last_node = e[1]
- if e[1] not in visited:
- visited.add(e[1])
- r_id, orientation = e[1].split(":")
- orientation = "E" if orientation == "B" else "E"
- visited.add( r_id +":" + orientation)
- if not is_branch_node(sub_graph_r, e[1]):
- tips.add(e[1])
- else:
-
- is_branch = is_branch_node(sub_graph, n)
- if not is_branch:
- for e in out_edges:
- sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
- last_node = e[1]
- if e[1] not in visited:
- r_id, orientation = e[1].split(":")
- visited.add(e[1])
- orientation = "E" if orientation == "B" else "E"
- visited.add( r_id +":" + orientation)
- if not is_branch_node(sub_graph_r, e[1]):
- tips.add(e[1])
- ct += 1
- last_node = None
- longest_len = 0
-
- sub_graph2_nodes = sub_graph2.nodes()
- sub_graph2_edges = sub_graph2.edges()
-
-
- new_path = [path[0]]
- for n in sub_graph2_nodes:
- if len(sub_graph2.out_edges(n)) == 0 :
- path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
- path_len = len(path_t)
- if path_len > longest_len:
- last_node = n
- longest_len = path_len
- new_path = path_t
-
- if last_node == None:
- for n in sub_graph2_nodes:
- path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
- path_len = len(path_t)
- if path_len > longest_len:
- last_node = n
- longest_len = path_len
- new_path = path_t
-
-
- path = new_path
-
- # clean up sub_graph2 according to new begin and end
- sub_graph2_r = sub_graph2.reverse()
- down_path = nx.ego_graph(sub_graph2, path[0], radius=len(path), undirected=False)
- up_path = nx.ego_graph(sub_graph2_r, path[-1], radius=len(path), undirected=False)
- subgraph_nodes = set(down_path) & set(up_path)
- for v in sub_graph2_nodes:
- if v not in subgraph_nodes:
- sub_graph2.remove_node(v)
-
- if DEBUG_LOG_LEVEL > 1:
- print "new_path", path[0], last_node, len(sub_graph2_nodes), path
-
-
- bundle_paths = [path]
- p_nodes = set(path)
- p_edges = set(zip(path[:-1], path[1:]))
-
- sub_graph2_nodes = sub_graph2.nodes()
- sub_graph2_edges = sub_graph2.edges()
-
- nodes_idx = dict( [ (n[1], n[0]) for n in enumerate(path) ] )
-
-
- # create a list of subpath that has no branch
- non_branch_subpaths = []
- wi = 0
- vi = 0
- v = path[0]
- while v != path[-1] and wi < len(path)-1:
- wi += 1
- w = path[wi]
- while len( sub_graph2.successors(w) ) == 1 and len( sub_graph2.predecessors(w) ) == 1 and wi < len(path)-1:
- wi += 1
- w = path[wi]
- if len( sub_graph2.successors(v) )!= 1 or len( sub_graph2.predecessors(w) )!= 1:
- branched = True
- else:
- branched = False
-
- if not branched:
- non_branch_subpaths.append( path[vi:wi+1] )
- v = w
- vi = wi
-
- # create the accompany_graph that has the path of the alternative subpaths
-
- associate_graph = nx.DiGraph()
- for v, w in sub_graph2.edges_iter():
- if (v, w) not in p_edges:
- associate_graph.add_edge(v, w, n_weight = sub_graph2[v][w]["n_weight"])
-
- if DEBUG_LOG_LEVEL > 1:
- print "associate_graph size:", len(associate_graph)
- print "non_branch_subpaths",len(non_branch_subpaths), non_branch_subpaths
-
- # construct the bundle graph
- associate_graph_nodes = set(associate_graph.nodes())
- bundle_graph = nx.DiGraph()
- bundle_graph.add_path( path )
- for i in range(len(non_branch_subpaths)-1):
- if len(non_branch_subpaths[i]) == 0 or len( non_branch_subpaths[i+1] ) == 0:
- continue
- e1, e2 = non_branch_subpaths[i: i+2]
- v = e1[-1]
- w = e2[0]
- if v == w:
- continue
- in_between_node_count = nodes_idx[w] - nodes_idx[v]
- if v in associate_graph_nodes and w in associate_graph_nodes:
- try:
- a_path = nx.shortest_path(associate_graph, v, w, "n_weight")
- except nx.NetworkXNoPath:
- continue
- bundle_graph.add_path( a_path )
- bundle_paths.append( a_path )
-
- return bundle_graph, bundle_paths, sub_graph2_edges
-
-def get_bundles(u_edges):
-
- """
- input: all unitig edges
- output: the assembled primary_tigs.fa and all_tigs.fa
- """
-
- ASM_graph = nx.DiGraph()
- out_f = open("primary_tigs.fa", "w")
- main_tig_paths = open("primary_tigs_paths","w")
- sv_tigs = open("all_tigs.fa","w")
- sv_tig_paths = open("all_tigs_paths","w")
- max_weight = 0
- for v, w in u_edges:
- x = max( [len(s[1]) for s in u_edges[ (v,w) ] ] )
- if DEBUG_LOG_LEVEL > 1:
- print "W", v, w, x
- if x > max_weight:
- max_weight = x
-
- in_edges = {}
- out_edges = {}
- for v, w in u_edges:
- in_edges.setdefault(w, [])
- out_edges.setdefault(w, [])
- in_edges[w].append( (v, w) )
-
- out_edges.setdefault(v, [])
- in_edges.setdefault(v, [])
- out_edges[v].append( (v, w) )
-
- u_graph = nx.DiGraph()
- for v,w in u_edges:
-
- u_graph.add_edge(v, w, n_weight = max_weight - max( [len(s[1]) for s in u_edges[ (v,w) ] ] ) )
-
- bundle_edge_out = open("bundle_edges","w")
- bundle_index = 0
- G = u_graph.copy()
- visited_u_edges = set()
- while len(G) > 0:
-
- root_nodes = set()
- for n in G:
- if G.in_degree(n) == 0:
- root_nodes.add(n)
-
- if len(root_nodes) == 0:
- if G.in_degree(n) != 1:
- root_nodes.add(n)
-
- if len(root_nodes) == 0:
- root_nodes.add( G.nodes()[0] )
-
- candidates = []
-
- for n in list(root_nodes):
- sp =nx.single_source_shortest_path_length(G, n)
- sp = sp.items()
- sp.sort(key=lambda x : x[1])
- longest = sp[-1]
- if DEBUG_LOG_LEVEL > 2:
- print "L", n, longest[0]
- if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop
- continue
- candidates.append ( (longest[1], n, longest[0]) )
-
- if len(candidates) == 0:
- print "no more candiate", len(G.edges()), len(G.nodes())
- if len(G.edges()) > 0:
- path = G.edges()[0]
- print path
- else:
- break
- else:
- candidates.sort()
-
- candidate = candidates[-1]
-
- if candidate[1] == candidate[2]:
- G.remove_node(candidate[1])
- continue
-
- path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight")
-
- if DEBUG_LOG_LEVEL > 1:
- print "X", path[0], path[-1], len(path)
-
- cmp_edges = set()
- g_edges = set(G.edges())
- new_path = []
- tail = True
- # avioid confusion due to long palindrome sequence
- if len(path) > 2:
- for i in range( 0, len( path ) - 1 ):
- v_n, w_n = path[i:i+2]
- new_path.append(v_n)
- # the comment out code below might be useful for filter out some high connectivity nodes
- #if (v_n, w_n) in cmp_edges or\
- # len(u_graph.out_edges(w_n)) > 5 or\
- # len(u_graph.in_edges(w_n)) > 5:
- if (v_n, w_n) in cmp_edges:
- tail = False
- break
-
- r_id, end = v_n.split(":")
- end = "E" if end == "B" else "B"
- v_n2 = r_id + ":" + end
-
- r_id, end = w_n.split(":")
- end = "E" if end == "B" else "B"
- w_n2 = r_id + ":" + end
-
- if (w_n2, v_n2) in g_edges:
- cmp_edges.add( (w_n2, v_n2) )
-
- if tail:
- new_path.append(w_n)
- else:
- new_path = path[:]
-
-
- if len(new_path) > 1:
- path = new_path
-
- if DEBUG_LOG_LEVEL > 2:
- print "Y", path[0], path[-1], len(path)
-
- bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G )
- for bg_edge in bundle_graph_edges:
- print >> bundle_edge_out, bundle_index, "edge", bg_edge[0], bg_edge[1]
- for path_ in bundle_paths:
- print >>bundle_edge_out, "path", bundle_index, " ".join(path_)
-
- edges_to_be_removed = set()
- if DEBUG_LOG_LEVEL > 2:
- print "Z", bundle_paths[0][0], bundle_paths[0][-1]
- print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
-
- if len(bundle_graph_edges) > 0:
-
- ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
- extra_u_edges = []
-
- print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
- subseqs = []
-
- for i in range(len(bundle_paths[0]) - 1):
- v, w = bundle_paths[0][i:i+2]
- edges_to_be_removed.add( (v,w) )
- uedges = u_edges[ (v,w) ]
- uedges.sort( key= lambda x: len(x[0]) )
- subseqs.append( uedges[-1][1] )
- visited_u_edges.add( "-".join(uedges[-1][0]) )
- for ue in uedges:
- if "-".join(ue[0]) not in visited_u_edges:
- visited_u_edges.add("-".join(ue[0]))
- extra_u_edges.append(ue)
- seq = "".join(subseqs)
- sv_tig_idx = 0
- print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(bundle_paths[0]) )
- if len(seq) > 0:
- print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
- print >> out_f, seq
- print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, bundle_paths[0][0], bundle_paths[0][-1])
- print >> sv_tigs, "".join(subseqs)
-
- sv_tig_idx += 1
-
- for sv_path in bundle_paths[1:]:
- print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
- ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
- subseqs = []
- for i in range(len(sv_path) - 1):
- v, w = sv_path[i:i+2]
- edges_to_be_removed.add( (v,w) )
- uedges = u_edges[ (v,w) ]
- uedges.sort( key= lambda x: len(x[0]) )
- subseqs.append( uedges[-1][1] )
- visited_u_edges.add( "-".join(uedges[-1][0]) )
- for ue in uedges:
- if "-".join(ue[0]) not in visited_u_edges:
- visited_u_edges.add("-".join(ue[0]))
- extra_u_edges.append(ue)
- seq = "".join(subseqs)
- if len(seq) > 0:
- print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
- print >> sv_tigs, "".join(subseqs)
- sv_tig_idx += 1
- for u_path, seq in extra_u_edges:
- #u_path = u_path.split("-")
- ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
- print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
- print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
- print >> sv_tigs, seq
- sv_tig_idx += 1
-
-
- bundle_index += 1
- else:
- #TODO, consolidate code here
- v,w = path
- uedges = u_edges[ (v,w) ]
- uedges.sort( key= lambda x: len(x[0]) )
- subseqs.append( uedges[-1][1] )
- seq = "".join(subseqs)
- print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(paths) )
- print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, path[0], path[-1])
- print >> sv_tigs, seq
- sv_tig_idx += 1
- bundle_index += 1
- bundle_graph_edges = zip(path[:-1],path[1:])
-
- #clean up the graph
-
- edges = set(G.edges())
- edges_to_be_removed |= set(bundle_graph_edges)
-
- if DEBUG_LOG_LEVEL > 2:
- print "BGE",bundle_graph_edges
-
- edge_remove_count = 0
- for v, w in edges_to_be_removed:
- if (v, w) in edges:
- G.remove_edge( v, w )
- edge_remove_count += 1
- if DEBUG_LOG_LEVEL > 2:
- print "remove edge", bundle_index, w, v
-
- edges = set(G.edges())
- for v, w in edges_to_be_removed:
-
- r_id, end = v.split(":")
- end = "E" if end == "B" else "B"
- v = r_id + ":" + end
-
- r_id, end = w.split(":")
- end = "E" if end == "B" else "B"
- w = r_id + ":" + end
-
- if (w, v) in edges:
- G.remove_edge( w, v )
- edge_remove_count += 1
- if DEBUG_LOG_LEVEL > 2:
- print "remove edge", bundle_index, w, v
-
- if edge_remove_count == 0:
- break
-
- nodes = G.nodes()
- for n in nodes:
- if G.in_degree(n) == 0 and G.out_degree(n) == 0:
- G.remove_node(n)
- if DEBUG_LOG_LEVEL > 2:
- print "remove node", n
-
- sv_tig_paths.close()
- sv_tigs.close()
- main_tig_paths.close()
- out_f.close()
- bundle_edge_out.close()
- return ASM_graph
-
-
-
-def SGToNXG(sg):
- G=nx.DiGraph()
-
- max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
- out_f = open("edges_list","w")
- for v, w in sg.edges:
- if sg.e_reduce[(v, w)] != True:
- ##if 1:
- out_degree = len(sg.nodes[v].out_edges)
- G.add_node( v, size = out_degree )
- G.add_node( w, size = out_degree )
- label = sg.edges[ (v, w) ].attr["label"]
- score = sg.edges[ (v, w) ].attr["score"]
- print >>out_f, v, w, label, score
- G.add_edge( v, w, label = label, weight = 0.001*score, n_weight = max_score - score )
- #print in_node_name, out_node_name
- out_f.close()
- return G
-
-if __name__ == "__main__":
-
- import argparse
-
- parser = argparse.ArgumentParser(description='a example string graph assembler that is desinged for handling diploid genomes')
- parser.add_argument('overlap_file', help='a file that contains the overlap information.')
- parser.add_argument('read_fasta', help='the file that contains the sequence to be assembled')
- parser.add_argument('--min_len', type=int, default=4000,
- help='minimum length of the reads to be considered for assembling')
- parser.add_argument('--min_idt', type=float, default=96,
- help='minimum alignment identity of the reads to be considered for assembling')
- parser.add_argument('--disable_chimer_prediction', action="store_true", default=False,
- help='you may want to disable this as some reads can be falsely identified as chimers in low coverage case')
-
- args = parser.parse_args()
-
-
- overlap_file = args.overlap_file
- read_fasta = args.read_fasta
-
- seqs = {}
- # load all p-reads into memory
- f = FastaReader(read_fasta)
- for r in f:
- seqs[r.name] = r.sequence.upper()
-
- G=nx.Graph()
- edges =set()
- overlap_data = []
- contained_reads = set()
- overlap_count = {}
-
-
- # loop through the overlapping data to load the data in the a python array
- # contained reads are identified
-
- with open(overlap_file) as f:
- for l in f:
- l = l.strip().split()
-
- #work around for some ill formed data recored
- if len(l) != 13:
- continue
-
- f_id, g_id, score, identity = l[:4]
- if f_id == g_id: # don't need self-self overlapping
- continue
-
- if g_id not in seqs:
- continue
-
- if f_id not in seqs:
- continue
-
- score = int(score)
- identity = float(identity)
- contained = l[12]
- if contained == "contained":
- contained_reads.add(f_id)
- continue
- if contained == "contains":
- contained_reads.add(g_id)
- continue
- if contained == "none":
- continue
-
- if identity < args.min_idt: # only take record with >96% identity as overlapped reads
- continue
- #if score > -2000:
- # continue
- f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
- g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
-
- # only used reads longer than the 4kb for assembly
- if f_len < args.min_len: continue
- if g_len < args.min_len: continue
-
- # double check for proper overlap
- if f_start > 24 and f_len - f_end > 24: # allow 24 base tolerance on both sides of the overlapping
- continue
-
- if g_start > 24 and g_len - g_end > 24:
- continue
-
- if g_strain == 0:
- if f_start < 24 and g_len - g_end > 24:
- continue
- if g_start < 24 and f_len - f_end > 24:
- continue
- else:
- if f_start < 24 and g_start > 24:
- continue
- if g_start < 24 and f_start > 24:
- continue
-
- overlap_data.append( (f_id, g_id, score, identity,
- f_strain, f_start, f_end, f_len,
- g_strain, g_start, g_end, g_len) )
-
- overlap_count[f_id] = overlap_count.get(f_id,0)+1
- overlap_count[g_id] = overlap_count.get(g_id,0)+1
-
- overlap_set = set()
- sg = StringGraph()
- for od in overlap_data:
- f_id, g_id, score, identity = od[:4]
- if f_id in contained_reads:
- continue
- if g_id in contained_reads:
- continue
- f_s, f_b, f_e, f_l = od[4:8]
- g_s, g_b, g_e, g_l = od[8:12]
- overlap_pair = [f_id, g_id]
- overlap_pair.sort()
- overlap_pair = tuple( overlap_pair )
- if overlap_pair in overlap_set: # don't allow duplicated records
- continue
- else:
- overlap_set.add(overlap_pair)
-
-
- if g_s == 1: # revered alignment, swapping the begin and end coordinates
- g_b, g_e = g_e, g_b
-
- # build the string graph edges for each overlap
- if f_b > 24:
- if g_b < g_e:
- """
- f.B f.E
- f ----------->
- g ------------->
- g.B g.E
- """
- if f_b == 0 or g_e - g_l == 0:
- continue
- sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
- length = abs(f_b-0),
- score = -score)
- sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_e, g_l),
- length = abs(g_e-g_l),
- score = -score)
- else:
- """
- f.B f.E
- f ----------->
- g <-------------
- g.E g.B
- """
- if f_b == 0 or g_e == 0:
- continue
- sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
- length = abs(f_b -0),
- score = -score)
- sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_e, 0),
- length = abs(g_e- 0),
- score = -score)
- else:
- if g_b < g_e:
- """
- f.B f.E
- f ----------->
- g ------------->
- g.B g.E
- """
- if g_b == 0 or f_e - f_l == 0:
- continue
- sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_b, 0),
- length = abs(g_b - 0),
- score = -score)
- sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
- length = abs(f_e-f_l),
- score = -score)
- else:
- """
- f.B f.E
- f ----------->
- g <-------------
- g.E g.B
- """
- if g_b - g_l == 0 or f_e - f_l ==0:
- continue
- sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_b, g_l),
- length = abs(g_b - g_l),
- score = -score)
- sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
- length = abs(f_e - f_l),
- score = -score)
-
-
- sg.init_reduce_dict()
- if not args.disable_chimer_prediction:
- sg.mark_chimer_edge()
- sg.mark_spur_edge()
- sg.mark_tr_edges() # mark those edges that transitive redundant
-
- if DEBUG_LOG_LEVEL > 1:
- print sum( [1 for c in sg.e_reduce.values() if c == True] )
- print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
- sg.mark_best_overlap() # mark those edges that are best overlap edges
-
- if DEBUG_LOG_LEVEL > 1:
- print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-
- G = SGToNXG(sg)
- #nx.write_gexf(G, "string_graph.gexf") # output the raw string string graph for visuliation
- nx.write_adjlist(G, "string_graph.adj") # write out the whole adjacent list of the string graph
-
- u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa") # reduct to string graph into unitig graph
- ASM_graph = get_bundles(u_edges ) # get the assembly
- #nx.write_gexf(ASM_graph, "asm_graph.gexf")
diff --git a/FALCON/src/py_scripts_v0.1/falcon_asm_s.py b/FALCON/src/py_scripts_v0.1/falcon_asm_s.py
deleted file mode 100755
index 5041881..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_asm_s.py
+++ /dev/null
@@ -1,1220 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from pbcore.io import FastaReader
-import networkx as nx
-import os
-import shlex
-import sys
-import subprocess
-
-DEBUG_LOG_LEVEL = 0
-
-class SGNode(object):
- """
- class representing a node in the string graph
- """
- def __init__(self, node_name):
- self.name = node_name
- self.out_edges = []
- self.in_edges = []
- def add_out_edge(self, out_edge):
- self.out_edges.append(out_edge)
- def add_in_edge(self, in_edge):
- self.in_edges.append(in_edge)
-
-class SGEdge(object):
- """
- class representing an edge in the string graph
- """
- def __init__(self, in_node, out_node):
- self.in_node = in_node
- self.out_node = out_node
- self.attr = {}
- def set_attribute(self, attr, value):
- self.attr[attr] = value
-
-def reverse_end( node_id ):
- node_id, end = node_id.split(":")
- new_end = "B" if end == "E" else "E"
- return node_id + ":" + new_end
-
-class StringGraph(object):
- """
- class representing the string graph
- """
- def __init__(self):
- self.nodes = {}
- self.edges = {}
- self.n_mark = {}
- self.e_reduce = {}
- self.repeat_overlap = {}
-
- def add_node(self, node_name):
- """
- add a node into the graph by given a node name
- """
- if node_name not in self.nodes:
- self.nodes[node_name] = SGNode(node_name)
-
- def add_edge(self, in_node_name, out_node_name, **attributes):
- """
- add an edge into the graph by given a pair of nodes
- """
- if (in_node_name, out_node_name) not in self.edges:
-
- self.add_node(in_node_name)
- self.add_node(out_node_name)
- in_node = self.nodes[in_node_name]
- out_node = self.nodes[out_node_name]
-
- edge = SGEdge(in_node, out_node)
- self.edges[ (in_node_name, out_node_name) ] = edge
- in_node.add_out_edge(edge)
- out_node.add_in_edge(edge)
- edge = self.edges[ (in_node_name, out_node_name) ]
- for k, v in attributes.items():
- edge.attr[k] = v
-
- def init_reduce_dict(self):
- for e in self.edges:
- self.e_reduce[e] = False
-
- def mark_chimer_edge(self):
-
- for e_n, e in self.edges.items():
- v = e_n[0]
- w = e_n[1]
- overlap_count = 0
- for w_out_e in self.nodes[w].out_edges:
- w_out_n = w_out_e.out_node.name
- if (v, w_out_n) in self.edges:
- overlap_count += 1
- for v_in_e in self.nodes[v].in_edges:
- v_in_n = v_in_e.in_node.name
- if (v_in_n, w) in self.edges:
- overlap_count += 1
- if self.e_reduce[ (v, w) ] != True:
- if overlap_count == 0:
- self.e_reduce[(v, w)] = True
- #print "XXX: chimer edge %s %s removed" % (v, w)
- v, w = reverse_end(w), reverse_end(v)
- self.e_reduce[(v, w)] = True
- #print "XXX: chimer edge %s %s removed" % (v, w)
-
-
-
- def mark_spur_edge(self):
-
- for v in self.nodes:
- if len(self.nodes[v].out_edges) > 1:
- for out_edge in self.nodes[v].out_edges:
- w = out_edge.out_node.name
-
- if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
- #print "XXX: spur edge %s %s removed" % (v, w)
- self.e_reduce[(v, w)] = True
- v2, w2 = reverse_end(w), reverse_end(v)
- #print "XXX: spur edge %s %s removed" % (v2, w2)
- self.e_reduce[(v, w)] = True
-
- if len(self.nodes[v].in_edges) > 1:
- for in_edge in self.nodes[v].in_edges:
- w = in_edge.in_node.name
- if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
- #print "XXX: spur edge %s %s removed" % (w, v)
- self.e_reduce[(w, v)] = True
- v2, w2 = reverse_end(w), reverse_end(v)
- #print "XXX: spur edge %s %s removed" % (w2, v2)
- self.e_reduce[(w, v)] = True
-
-
- def mark_tr_edges(self):
- """
- transitive reduction
- """
- n_mark = self.n_mark
- e_reduce = self.e_reduce
- FUZZ = 500
- for n in self.nodes:
- n_mark[n] = "vacant"
-
- for n_name, node in self.nodes.items():
-
- out_edges = node.out_edges
- if len(out_edges) == 0:
- continue
-
- out_edges.sort(key=lambda x: x.attr["length"])
-
- for e in out_edges:
- w = e.out_node
- n_mark[ w.name ] = "inplay"
-
- max_len = out_edges[-1].attr["length"]
-
- max_len += FUZZ
-
- for e in out_edges:
- e_len = e.attr["length"]
- w = e.out_node
- if n_mark[w.name] == "inplay":
- w.out_edges.sort( key=lambda x: x.attr["length"] )
- for e2 in w.out_edges:
- if e2.attr["length"] + e_len < max_len:
- x = e2.out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
-
- for e in out_edges:
- e_len = e.attr["length"]
- w = e.out_node
- w.out_edges.sort( key=lambda x: x.attr["length"] )
- if len(w.out_edges) > 0:
- x = w.out_edges[0].out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
- for e2 in w.out_edges:
- if e2.attr["length"] < FUZZ:
- x = e2.out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
-
- for out_edge in out_edges:
- v = out_edge.in_node
- w = out_edge.out_node
- if n_mark[w.name] == "eliminated":
- e_reduce[ (v.name, w.name) ] = True
- #print "XXX: tr edge %s %s removed" % (v.name, w.name)
- v_name, w_name = reverse_end(w.name), reverse_end(v.name)
- e_reduce[(v_name, w_name)] = True
- #print "XXX: tr edge %s %s removed" % (v_name, w_name)
- n_mark[w.name] = "vacant"
-
-
- def mark_best_overlap(self):
- """
- find the best overlapped edges
- """
-
- best_edges = set()
-
- for v in self.nodes:
-
- out_edges = self.nodes[v].out_edges
- if len(out_edges) > 0:
- out_edges.sort(key=lambda e: e.attr["score"])
- e = out_edges[-1]
- best_edges.add( (e.in_node.name, e.out_node.name) )
-
- in_edges = self.nodes[v].in_edges
- if len(in_edges) > 0:
- in_edges.sort(key=lambda e: e.attr["score"])
- e = in_edges[-1]
- best_edges.add( (e.in_node.name, e.out_node.name) )
-
- if DEBUG_LOG_LEVEL > 1:
- print "X", len(best_edges)
-
- for e_n, e in self.edges.items():
- v = e_n[0]
- w = e_n[1]
- if self.e_reduce[ (v, w) ] != True:
- if (v, w) not in best_edges:
- self.e_reduce[(v, w)] = True
- #print "XXX: in best edge %s %s removed" % (v, w)
- v2, w2 = reverse_end(w), reverse_end(v)
- #print "XXX: in best edge %s %s removed" % (v2, w2)
- self.e_reduce[(v2, w2)] = True
-
- def get_out_edges_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].out_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- return rtn
-
-
- def get_in_edges_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].in_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- return rtn
-
- def get_best_out_edge_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].out_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- rtn.sort(key=lambda e: e.attr["score"])
-
- return rtn[-1]
-
- def get_best_in_edge_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].in_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- rtn.sort(key=lambda e: e.attr["score"])
- return rtn[-1]
-
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-def generate_seq_from_path(sg, seqs, path):
- subseqs = []
- r_id, end = path[0].split(":")
-
- count = 0
- for i in range( len( path ) -1 ):
- w_n, v_n = path[i:i+2]
- edge = sg.edges[ (w_n, v_n ) ]
- read_id, coor = edge.attr["label"].split(":")
- b,e = coor.split("-")
- b = int(b)
- e = int(e)
- if b < e:
- subseqs.append( seqs[read_id][b:e] )
- else:
- subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
-
- return "".join(subseqs)
-
-
-def reverse_path( path ):
- new_path = []
- for n in list(path[::-1]):
- rid, end = n.split(":")
- new_end = "B" if end == "E" else "E"
- new_path.append( rid+":"+new_end)
- return new_path
-
-
-def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
-
- """
-    given a string graph (sg) and the sequences (seqs), write the unitig fasta file into out_fn
-    the function returns a reduced graph representing the reduced string graph, where the edges are unitigs
-
-    some extra files are generated:
-        unit_edges.dat : an easy-to-parse file of the unitig data
-        unit_edge_paths : the paths of all unitigs
-        uni_graph.gexf : the unitig graph in gexf format for visualization
- """
-
- G = SGToNXG(sg)
- if connected_nodes != None:
- connected_nodes = set(sg.nodes)
- out_fasta = open(out_fn, "w")
- nodes_for_tig = set()
- sg_edges = set()
- for v, w in sg.edges:
- if sg.e_reduce[(v, w)] != True:
- sg_edges.add( (v, w) )
- count = 0
- edges_in_tigs = set()
-
- uni_edges = {}
- path_f = open("unit_edge_paths","w")
- uni_edge_f = open("unit_edges.dat", "w")
- while len(sg_edges) > 0:
- v, w = sg_edges.pop()
-
- #nodes_for_tig.remove(n)
- upstream_nodes = []
-
- c_node = v
- p_in_edges = sg.get_in_edges_for_node(c_node)
- p_out_edges = sg.get_out_edges_for_node(c_node)
- while len(p_in_edges) == 1 and len(p_out_edges) == 1:
- p_node = p_in_edges[0].in_node
- upstream_nodes.append(p_node.name)
- if (p_node.name, c_node) not in sg_edges:
- break
- p_in_edges = sg.get_in_edges_for_node(p_node.name)
- p_out_edges = sg.get_out_edges_for_node(p_node.name)
- c_node = p_node.name
-
- upstream_nodes.reverse()
-
- downstream_nodes = []
- c_node = w
- n_out_edges = sg.get_out_edges_for_node(c_node)
- n_in_edges = sg.get_in_edges_for_node(c_node)
- while len(n_out_edges) == 1 and len(n_in_edges) == 1:
- n_node = n_out_edges[0].out_node
- downstream_nodes.append(n_node.name)
- if (c_node, n_node.name) not in sg_edges:
- break
- n_out_edges = sg.get_out_edges_for_node(n_node.name)
- n_in_edges = sg.get_in_edges_for_node(n_node.name)
- c_node = n_node.name
-
- whole_path = upstream_nodes + [v, w] + downstream_nodes
- count += 1
- subseq = generate_seq_from_path(sg, seqs, whole_path)
- #subseq = ""
- uni_edges.setdefault( (whole_path[0], whole_path[-1]), [] )
- uni_edges[(whole_path[0], whole_path[-1])].append( ( whole_path, subseq ) )
- print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), subseq
- print >>path_f, ">%05dc-%s-%s-%d %s" % (count, whole_path[0], whole_path[-1], len(whole_path), " ".join(whole_path))
- print >>out_fasta, ">%05dc-%s-%s-%d" % (count, whole_path[0], whole_path[-1], len(whole_path))
- print >>out_fasta, subseq
- for i in range( len( whole_path ) -1 ):
- w_n, v_n = whole_path[i:i+2]
- try:
- sg_edges.remove( (w_n, v_n) )
- except KeyError: #if an edge is already deleted, ignore it
- pass
-
- r_whole_path = reverse_path( whole_path )
- count += 1
- subseq = generate_seq_from_path(sg, seqs, r_whole_path)
- #subseq = ""
- uni_edges.setdefault( (r_whole_path[0], r_whole_path[-1]), [] )
- uni_edges[(r_whole_path[0], r_whole_path[-1])].append( ( r_whole_path, subseq ) )
- print >> uni_edge_f, r_whole_path[0], r_whole_path[-1], "-".join(r_whole_path), subseq
- print >>path_f, ">%05dc-%s-%s-%d %s" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path), " ".join(r_whole_path))
- print >>out_fasta, ">%05dc-%s-%s-%d" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path))
- print >>out_fasta, subseq
- for i in range( len( r_whole_path ) -1 ):
- w_n, v_n = r_whole_path[i:i+2]
- try:
- sg_edges.remove( (w_n, v_n) )
- except KeyError: #if an edge is already deleted, ignore it
- pass
-
-
- path_f.close()
- uni_edge_f.close()
- #uni_graph = nx.DiGraph()
- #for n1, n2 in uni_edges.keys():
- # uni_graph.add_edge(n1, n2, count = len( uni_edges[ (n1,n2) ] ))
- #nx.write_gexf(uni_graph, "uni_graph.gexf")
-
- out_fasta.close()
- return uni_edges
-
-def neighbor_bound(G, v, w, radius):
- """
-    test whether the forward neighborhoods of node v and node w, within the given radius, share an edge in graph G
- """
- g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
- g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
- if len(set(g1.edges()) & set(g2.edges())) > 0:
- return True
- else:
- return False
-
-
-def is_branch_node(G, n):
- """
-    test whether the node n is a "branch node", i.e. one for which the paths from
-    some pair of its successors do not intersect within a given radius
- """
- out_edges = G.out_edges([n])
- n2 = [ e[1] for e in out_edges ]
- is_branch = False
- for i in range(len(n2)):
- for j in range(i+1, len(n2)):
- v = n2[i]
- w = n2[j]
- if neighbor_bound(G, v, w, 10) == False:
- is_branch = True
- break
- if is_branch == True:
- break
- return is_branch
-
-
-def get_bundle( path, u_graph, u_graph_r ):
-
- """
-    find a sub-graph containing the nodes between the start and the end of the path
- inputs:
- u_graph : a unitig graph
- returns:
- bundle_graph: the whole bundle graph
- bundle_paths: the paths in the bundle graph
- sub_graph2_edges: all edges of the bundle graph
-
- """
-
- p_start, p_end = path[0], path[-1]
- p_nodes = set(path)
- p_edges = set(zip(path[:-1], path[1:]))
-
- down_path = nx.ego_graph(u_graph, p_start, radius=len(p_nodes), undirected=False)
- up_path = nx.ego_graph(u_graph_r, p_end, radius=len(p_nodes), undirected=False)
- subgraph_nodes = set(down_path) & set(up_path)
-
-
- sub_graph = nx.DiGraph()
- for v, w in u_graph.edges_iter():
- if v in subgraph_nodes and w in subgraph_nodes:
- if (v, w) in p_edges:
- sub_graph.add_edge(v, w, color = "red")
- else:
- sub_graph.add_edge(v, w, color = "black")
-
- sub_graph2 = nx.DiGraph()
- tips = set()
- tips.add(path[0])
- sub_graph_r = sub_graph.reverse()
- visited = set()
- ct = 0
- is_branch = is_branch_node(sub_graph, path[0]) #if the start node is a branch node
- if is_branch:
- n = tips.pop()
-        e = sub_graph.out_edges([n])[0] #pick one path to build the subgraph
- sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
- if e[1] not in visited:
- last_node = e[1]
- visited.add(e[1])
- r_id, orientation = e[1].split(":")
-            orientation = "E" if orientation == "B" else "B"
- visited.add( r_id +":" + orientation)
- if not is_branch_node(sub_graph_r, e[1]):
- tips.add(e[1])
-
- while len(tips) != 0:
- n = tips.pop()
- out_edges = sub_graph.out_edges([n])
- if len(out_edges) == 1:
- e = out_edges[0]
- sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
- last_node = e[1]
- if e[1] not in visited:
- visited.add(e[1])
- r_id, orientation = e[1].split(":")
-                orientation = "E" if orientation == "B" else "B"
- visited.add( r_id +":" + orientation)
- if not is_branch_node(sub_graph_r, e[1]):
- tips.add(e[1])
- else:
-
- is_branch = is_branch_node(sub_graph, n)
- if not is_branch:
- for e in out_edges:
- sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
- last_node = e[1]
- if e[1] not in visited:
- r_id, orientation = e[1].split(":")
- visited.add(e[1])
-                        orientation = "E" if orientation == "B" else "B"
- visited.add( r_id +":" + orientation)
- if not is_branch_node(sub_graph_r, e[1]):
- tips.add(e[1])
- ct += 1
- last_node = None
- longest_len = 0
-
- sub_graph2_nodes = sub_graph2.nodes()
- sub_graph2_edges = sub_graph2.edges()
-
-
- new_path = [path[0]]
- for n in sub_graph2_nodes:
- if len(sub_graph2.out_edges(n)) == 0 :
- path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
- path_len = len(path_t)
- if path_len > longest_len:
- last_node = n
- longest_len = path_len
- new_path = path_t
-
- if last_node == None:
- for n in sub_graph2_nodes:
- path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
- path_len = len(path_t)
- if path_len > longest_len:
- last_node = n
- longest_len = path_len
- new_path = path_t
-
-
- path = new_path
-
- # clean up sub_graph2 according to new begin and end
- sub_graph2_r = sub_graph2.reverse()
- down_path = nx.ego_graph(sub_graph2, path[0], radius=len(path), undirected=False)
- up_path = nx.ego_graph(sub_graph2_r, path[-1], radius=len(path), undirected=False)
- subgraph_nodes = set(down_path) & set(up_path)
- for v in sub_graph2_nodes:
- if v not in subgraph_nodes:
- sub_graph2.remove_node(v)
-
- if DEBUG_LOG_LEVEL > 1:
- print "new_path", path[0], last_node, len(sub_graph2_nodes), path
-
-
- bundle_paths = [path]
- p_nodes = set(path)
- p_edges = set(zip(path[:-1], path[1:]))
-
- sub_graph2_nodes = sub_graph2.nodes()
- sub_graph2_edges = sub_graph2.edges()
-
- nodes_idx = dict( [ (n[1], n[0]) for n in enumerate(path) ] )
-
-
- # create a list of subpath that has no branch
- non_branch_subpaths = []
- wi = 0
- vi = 0
- v = path[0]
- while v != path[-1] and wi < len(path)-1:
- wi += 1
- w = path[wi]
- while len( sub_graph2.successors(w) ) == 1 and len( sub_graph2.predecessors(w) ) == 1 and wi < len(path)-1:
- wi += 1
- w = path[wi]
- if len( sub_graph2.successors(v) )!= 1 or len( sub_graph2.predecessors(w) )!= 1:
- branched = True
- else:
- branched = False
-
- if not branched:
- non_branch_subpaths.append( path[vi:wi+1] )
- v = w
- vi = wi
-
- # create the accompany_graph that has the path of the alternative subpaths
-
- associate_graph = nx.DiGraph()
- for v, w in sub_graph2.edges_iter():
- if (v, w) not in p_edges:
- associate_graph.add_edge(v, w, n_weight = sub_graph2[v][w]["n_weight"])
-
- if DEBUG_LOG_LEVEL > 1:
- print "associate_graph size:", len(associate_graph)
- print "non_branch_subpaths",len(non_branch_subpaths), non_branch_subpaths
-
- # construct the bundle graph
- associate_graph_nodes = set(associate_graph.nodes())
- bundle_graph = nx.DiGraph()
- bundle_graph.add_path( path )
- for i in range(len(non_branch_subpaths)-1):
- if len(non_branch_subpaths[i]) == 0 or len( non_branch_subpaths[i+1] ) == 0:
- continue
- e1, e2 = non_branch_subpaths[i: i+2]
- v = e1[-1]
- w = e2[0]
- if v == w:
- continue
- in_between_node_count = nodes_idx[w] - nodes_idx[v]
- if v in associate_graph_nodes and w in associate_graph_nodes:
- try:
- a_path = nx.shortest_path(associate_graph, v, w, "n_weight")
- except nx.NetworkXNoPath:
- continue
- bundle_graph.add_path( a_path )
- bundle_paths.append( a_path )
-
- return bundle_graph, bundle_paths, sub_graph2_edges
-
-def get_bundles(u_edges):
-
- """
- input: all unitig edges
- output: the assembled primary_tigs.fa and all_tigs.fa
- """
-
- ASM_graph = nx.DiGraph()
- out_f = open("primary_tigs.fa", "w")
- main_tig_paths = open("primary_tigs_paths","w")
- sv_tigs = open("all_tigs.fa","w")
- sv_tig_paths = open("all_tigs_paths","w")
- max_weight = 0
- for v, w in u_edges:
- x = max( [len(s[1]) for s in u_edges[ (v,w) ] ] )
- if DEBUG_LOG_LEVEL > 1:
- print "W", v, w, x
- if x > max_weight:
- max_weight = x
-
- in_edges = {}
- out_edges = {}
- for v, w in u_edges:
- in_edges.setdefault(w, [])
- out_edges.setdefault(w, [])
- in_edges[w].append( (v, w) )
-
- out_edges.setdefault(v, [])
- in_edges.setdefault(v, [])
- out_edges[v].append( (v, w) )
-
- u_graph = nx.DiGraph()
- for v,w in u_edges:
-
- u_graph.add_edge(v, w, n_weight = max_weight - max( [len(s[1]) for s in u_edges[ (v,w) ] ] ) )
-
- bundle_edge_out = open("bundle_edges","w")
- bundle_index = 0
-
-
- components = nx.weakly_connected_component_subgraphs(u_graph)
- components = [ (len(c), c) for c in components ]
- components.sort()
- #components.reverse()
- allS = len(u_graph)
- ssG = 0.0
- processed_overlaps = set()
- for sG, G in components:
-
- ssG += sG
- print "process graph of size ", sG, "%0.2f %0.2f" % (ssG, ssG/allS)
- G_edges = set(G.edges())
-
- dual_component = False
-
- for v, w in list(G_edges):
- v = v.split(":")[0]
- w = w.split(":")[0]
- if (v, w) in processed_overlaps:
- dual_component = True
- break
-
- if dual_component == True:
- continue
-
- for v, w in list(G_edges):
- v = v.split(":")[0]
- w = w.split(":")[0]
- processed_overlaps.add( (v,w) )
- processed_overlaps.add( (w,v) )
-
- G_r = G.reverse()
- visited_u_edges = set()
-
- while len(G) > 0:
- out_f.flush()
- main_tig_paths.flush()
- sv_tigs.flush()
- sv_tig_paths.flush()
-
-
- #root_nodes = set()
- candidates = []
- for n in G:
- sp =nx.single_source_shortest_path_length(G, n)
- sp = sp.items()
- sp.sort(key=lambda x : x[1])
- longest = sp[-1]
- if DEBUG_LOG_LEVEL > 2:
- print "L", n, longest[0]
- if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop
- continue
- candidates.append ( (longest[1], n, longest[0]) )
-
- n = longest[0]
- sp =nx.single_source_shortest_path_length(G_r, n)
- sp = sp.items()
- sp.sort(key=lambda x : x[1])
- longest = sp[-1]
- if DEBUG_LOG_LEVEL > 2:
- print "L", n, longest[0]
- if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop
- continue
- candidates.append ( (longest[1], longest[0], n) )
- if len(candidates) != 0:
- break
-
- if len(candidates) == 0:
-                print "no more candidates", len(G.edges()), len(G.nodes())
- if len(G_edges) > 0:
- path = G_edges.pop()
- G_edges.add(path)
- print path
- else:
- break
- else:
- candidates.sort()
-
- candidate = candidates[-1]
-
- if candidate[1] == candidate[2]:
- G.remove_node(candidate[1])
- G_r.remove_node(candidate[1])
- continue
-
- path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight")
-
- if DEBUG_LOG_LEVEL > 1:
- print "X", path[0], path[-1], len(path)
-
- cmp_edges = set()
- #g_edges = set(G.edges())
- new_path = []
- tail = True
-            # avoid confusion due to long palindrome sequences
- if len(path) > 2:
- for i in range( 0, len( path ) - 1 ):
- v_n, w_n = path[i:i+2]
- new_path.append(v_n)
-                    # the commented-out code below might be useful for filtering out high-connectivity nodes
- #if (v_n, w_n) in cmp_edges or\
- # len(u_graph.out_edges(w_n)) > 5 or\
- # len(u_graph.in_edges(w_n)) > 5:
- if (v_n, w_n) in cmp_edges:
- tail = False
- break
-
- r_id, end = v_n.split(":")
- end = "E" if end == "B" else "B"
- v_n2 = r_id + ":" + end
-
- r_id, end = w_n.split(":")
- end = "E" if end == "B" else "B"
- w_n2 = r_id + ":" + end
-
- if (w_n2, v_n2) in G_edges:
- cmp_edges.add( (w_n2, v_n2) )
-
- if tail:
- new_path.append(w_n)
- else:
- new_path = path[:]
-
-
- if len(new_path) > 1:
- path = new_path
-
- if DEBUG_LOG_LEVEL > 2:
- print "Y", path[0], path[-1], len(path)
-
- bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G, G_r )
- for bg_edge in bundle_graph_edges:
- print >> bundle_edge_out, bundle_index, "edge", bg_edge[0], bg_edge[1]
- for path_ in bundle_paths:
- print >>bundle_edge_out, "path", bundle_index, " ".join(path_)
-
- edges_to_be_removed = set()
- if DEBUG_LOG_LEVEL > 2:
- print "Z", bundle_paths[0][0], bundle_paths[0][-1]
- print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
-
- if len(bundle_graph_edges) > 0:
-
- ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
- extra_u_edges = []
-
- print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
- subseqs = []
-
- for i in range(len(bundle_paths[0]) - 1):
- v, w = bundle_paths[0][i:i+2]
- edges_to_be_removed.add( (v,w) )
- uedges = u_edges[ (v,w) ]
- uedges.sort( key= lambda x: len(x[0]) )
- subseqs.append( uedges[-1][1] )
- visited_u_edges.add( "-".join(uedges[-1][0]) )
- for ue in uedges:
- if "-".join(ue[0]) not in visited_u_edges:
- visited_u_edges.add("-".join(ue[0]))
- extra_u_edges.append(ue)
- seq = "".join(subseqs)
- sv_tig_idx = 0
- print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(bundle_paths[0]) )
- if len(seq) > 0:
- print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
- print >> out_f, seq
- print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, bundle_paths[0][0], bundle_paths[0][-1])
- print >> sv_tigs, "".join(subseqs)
-
- sv_tig_idx += 1
-
- for sv_path in bundle_paths[1:]:
- print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
- ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
- subseqs = []
- for i in range(len(sv_path) - 1):
- v, w = sv_path[i:i+2]
- edges_to_be_removed.add( (v,w) )
- uedges = u_edges[ (v,w) ]
- uedges.sort( key= lambda x: len(x[0]) )
- subseqs.append( uedges[-1][1] )
- visited_u_edges.add( "-".join(uedges[-1][0]) )
- for ue in uedges:
- if "-".join(ue[0]) not in visited_u_edges:
- visited_u_edges.add("-".join(ue[0]))
- extra_u_edges.append(ue)
- seq = "".join(subseqs)
- if len(seq) > 0:
- print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
- print >> sv_tigs, "".join(subseqs)
- sv_tig_idx += 1
- for u_path, seq in extra_u_edges:
- #u_path = u_path.split("-")
- ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
- print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
- print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
- print >> sv_tigs, seq
- sv_tig_idx += 1
-
-
- bundle_index += 1
- else:
- #TODO, consolidate code here
-                v, w = path
-                uedges = u_edges[ (v,w) ]
-                uedges.sort( key= lambda x: len(x[0]) )
-                subseqs = [ uedges[-1][1] ]
-                seq = "".join(subseqs)
-                sv_tig_idx = 0
-                print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(path) )
- print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, path[0], path[-1])
- print >> sv_tigs, seq
- sv_tig_idx += 1
- bundle_index += 1
- bundle_graph_edges = zip(path[:-1],path[1:])
-
- #clean up the graph
-
- edges = set(G.edges())
- edges_to_be_removed |= set(bundle_graph_edges)
-
- if DEBUG_LOG_LEVEL > 2:
- print "BGE",bundle_graph_edges
-
- edge_remove_count = 0
- for v, w in edges_to_be_removed:
- if (v, w) in edges:
- G.remove_edge( v, w )
- G_r.remove_edge( w, v )
- G_edges.remove( (v, w) )
- edge_remove_count += 1
- if DEBUG_LOG_LEVEL > 2:
- print "remove edge", bundle_index, w, v
-
- edges = set(G.edges())
- for v, w in edges_to_be_removed:
-
- r_id, end = v.split(":")
- end = "E" if end == "B" else "B"
- v = r_id + ":" + end
-
- r_id, end = w.split(":")
- end = "E" if end == "B" else "B"
- w = r_id + ":" + end
-
- if (w, v) in edges:
- G.remove_edge( w, v )
- G_edges.remove( (w, v) )
- G_r.remove_edge( v, w )
- edge_remove_count += 1
- if DEBUG_LOG_LEVEL > 2:
- print "remove edge", bundle_index, w, v
-
- if edge_remove_count == 0:
- break
-
- nodes = G.nodes()
- for n in nodes:
- if G.in_degree(n) == 0 and G.out_degree(n) == 0:
- G.remove_node(n)
- G_r.remove_node(n)
- if DEBUG_LOG_LEVEL > 2:
- print "remove node", n
-
- sv_tig_paths.close()
- sv_tigs.close()
- main_tig_paths.close()
- out_f.close()
- bundle_edge_out.close()
- return ASM_graph
-
-
-
-def SGToNXG(sg):
- G=nx.DiGraph()
-
- max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
- out_f = open("edges_list","w")
- for v, w in sg.edges:
- if sg.e_reduce[(v, w)] != True:
- ##if 1:
- out_degree = len(sg.nodes[v].out_edges)
- G.add_node( v, size = out_degree )
- G.add_node( w, size = out_degree )
- label = sg.edges[ (v, w) ].attr["label"]
- score = sg.edges[ (v, w) ].attr["score"]
- print >>out_f, v, w, label, score
- G.add_edge( v, w, label = label, weight = 0.001*score, n_weight = max_score - score )
- #print in_node_name, out_node_name
- out_f.close()
- return G
-
-if __name__ == "__main__":
-
- import argparse
-
-    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
- parser.add_argument('overlap_file', help='a file that contains the overlap information.')
-    parser.add_argument('read_fasta', help='the file that contains the sequences to be assembled')
- parser.add_argument('--min_len', type=int, default=4000,
- help='minimum length of the reads to be considered for assembling')
- parser.add_argument('--min_idt', type=float, default=96,
- help='minimum alignment identity of the reads to be considered for assembling')
- parser.add_argument('--disable_chimer_prediction', action="store_true", default=False,
-                        help='you may want to disable this, as some reads can be falsely identified as chimeras in low-coverage cases')
-
- args = parser.parse_args()
-
-
- overlap_file = args.overlap_file
- read_fasta = args.read_fasta
-
- contained_reads = set()
- chimer_ids = set()
-
- with open("rc_out_all") as f:
- for l in f:
- l = l.strip().split()
- if l[1] == "2":
- chimer_ids.add(l[0])
- if l[1] == "1":
- contained_reads.add(l[0])
- print len(chimer_ids)
-
- seqs = {}
- # load all p-reads into memory
- f = FastaReader(read_fasta)
- for r in f:
- if r.name in contained_reads:
- continue
- if r.name in chimer_ids:
- continue
- seqs[r.name] = r.sequence.upper()
-
- G=nx.Graph()
- edges =set()
- overlap_data = []
- contained_reads = set()
- overlap_count = {}
-
-
-    # loop through the overlap data and load it into a python list
- # contained reads are identified
-
- with open(overlap_file) as f:
- for l in f:
- l = l.strip().split()
-
-            #work around some ill-formed data records
- if len(l) != 13:
- continue
-
- f_id, g_id, score, identity = l[:4]
- if f_id == g_id: # don't need self-self overlapping
- continue
-
- if g_id not in seqs:
- continue
-
- if f_id not in seqs:
- continue
-
- score = int(score)
- identity = float(identity)
- contained = l[12]
- if contained == "contained":
- contained_reads.add(f_id)
- continue
- if contained == "contains":
- contained_reads.add(g_id)
- continue
- if contained == "none":
- continue
-
-            if identity < args.min_idt: # only keep records with identity above the --min_idt threshold (default 96)
- continue
- #if score > -2000:
- # continue
- f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
- g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
-
-            # only use reads longer than --min_len (default 4 kb) for assembly
- if f_len < args.min_len: continue
- if g_len < args.min_len: continue
-
- # double check for proper overlap
- if f_start > 24 and f_len - f_end > 24: # allow 24 base tolerance on both sides of the overlapping
- continue
-
- if g_start > 24 and g_len - g_end > 24:
- continue
-
- if g_strain == 0:
- if f_start < 24 and g_len - g_end > 24:
- continue
- if g_start < 24 and f_len - f_end > 24:
- continue
- else:
- if f_start < 24 and g_start > 24:
- continue
- if g_start < 24 and f_start > 24:
- continue
-
- overlap_data.append( (f_id, g_id, score, identity,
- f_strain, f_start, f_end, f_len,
- g_strain, g_start, g_end, g_len) )
-
- overlap_count[f_id] = overlap_count.get(f_id,0)+1
- overlap_count[g_id] = overlap_count.get(g_id,0)+1
-
- print "###", len(overlap_data), len(contained_reads)
- overlap_set = set()
- sg = StringGraph()
- for od in overlap_data:
- f_id, g_id, score, identity = od[:4]
- if f_id in contained_reads:
- continue
- if g_id in contained_reads:
- continue
- f_s, f_b, f_e, f_l = od[4:8]
- g_s, g_b, g_e, g_l = od[8:12]
- overlap_pair = [f_id, g_id]
- overlap_pair.sort()
- overlap_pair = tuple( overlap_pair )
- if overlap_pair in overlap_set: # don't allow duplicated records
- continue
- else:
- overlap_set.add(overlap_pair)
-
-
-        if g_s == 1: # reversed alignment, swap the begin and end coordinates
- g_b, g_e = g_e, g_b
-
- # build the string graph edges for each overlap
- if f_b > 24:
- if g_b < g_e:
- """
- f.B f.E
- f ----------->
- g ------------->
- g.B g.E
- """
- if f_b == 0 or g_e - g_l == 0:
- continue
- sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
- length = abs(f_b-0),
- score = -score)
- sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_e, g_l),
- length = abs(g_e-g_l),
- score = -score)
- else:
- """
- f.B f.E
- f ----------->
- g <-------------
- g.E g.B
- """
- if f_b == 0 or g_e == 0:
- continue
- sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0),
- length = abs(f_b -0),
- score = -score)
- sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_e, 0),
- length = abs(g_e- 0),
- score = -score)
- else:
- if g_b < g_e:
- """
- f.B f.E
- f ----------->
- g ------------->
- g.B g.E
- """
- if g_b == 0 or f_e - f_l == 0:
- continue
- sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_b, 0),
- length = abs(g_b - 0),
- score = -score)
- sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
- length = abs(f_e-f_l),
- score = -score)
- else:
- """
- f.B f.E
- f ----------->
- g <-------------
- g.E g.B
- """
- if g_b - g_l == 0 or f_e - f_l ==0:
- continue
- sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_b, g_l),
- length = abs(g_b - g_l),
- score = -score)
- sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l),
- length = abs(f_e - f_l),
- score = -score)
-
-
- sg.init_reduce_dict()
- #if not args.disable_chimer_prediction:
- # sg.mark_chimer_edge()
- sg.mark_spur_edge()
-    sg.mark_tr_edges() # mark edges that are transitively redundant
-
- #if DEBUG_LOG_LEVEL > 1:
- if 1:
- print sum( [1 for c in sg.e_reduce.values() if c == True] )
- print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
- sg.mark_best_overlap() # mark those edges that are best overlap edges
-
- if DEBUG_LOG_LEVEL > 1:
- print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
-
- G = SGToNXG(sg)
-    nx.write_gexf(G, "string_graph.gexf") # output the raw string graph for visualization
-    nx.write_adjlist(G, "string_graph.adj") # write out the adjacency list of the whole string graph
-
-    u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa") # reduce the string graph into a unitig graph
- ASM_graph = get_bundles(u_edges ) # get the assembly
- nx.write_gexf(ASM_graph, "asm_graph.gexf")
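
The string-graph construction in the deleted falcon_asm_s.py above rests on two conventions: every read contributes two nodes, "<read>:B" and "<read>:E", and every dovetail overlap contributes a pair of edges whose reverse-complement twin is obtained by swapping and flipping the endpoints. The sketch below (plain Python; the read names, coordinates and score are invented, and only the first of the four overlap orientations handled by the script is shown) restates that convention; it is an illustration, not part of the 1.8.8 sources.

def reverse_end(node):
    # "r1:B" <-> "r1:E": the reverse-complement twin of a node
    read_id, end = node.split(":")
    return read_id + ":" + ("E" if end == "B" else "B")

def dovetail_edges(f_id, g_id, f_b, g_e, g_l, score):
    # one forward/forward dovetail overlap (f starts first, g extends past f)
    # yields two string-graph edges, as in the first ASCII diagram of the script:
    #        f.B         f.E
    #     f   ----------->
    #     g      ------------->
    #        g.B           g.E
    return [
        ("%s:B" % g_id, "%s:B" % f_id, "%s:%d-%d" % (f_id, f_b, 0), abs(f_b - 0), -score),
        ("%s:E" % f_id, "%s:E" % g_id, "%s:%d-%d" % (g_id, g_e, g_l), abs(g_e - g_l), -score),
    ]

for v, w, label, length, s in dovetail_edges("r1", "r2", 300, 4700, 6000, -9000):
    print v, "->", w, label, length, s, "| twin:", reverse_end(w), "->", reverse_end(v)
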
diff --git a/FALCON/src/py_scripts_v0.1/falcon_dedup.py b/FALCON/src/py_scripts_v0.1/falcon_dedup.py
deleted file mode 100644
index b574fad..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_dedup.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import subprocess
-from pbcore.io import FastaReader
-
-def get_matches(seq0, seq1):
- with open("tmp_seq0.fa","w") as f:
- print >>f, ">seq0"
- print >>f, seq0
- with open("tmp_seq1.fa","w") as f:
- print >>f, ">seq1"
- print >>f, seq1
- mgaps_out=subprocess.check_output("mummer -maxmatch -c -b -l 24 tmp_seq0.fa tmp_seq1.fa | mgaps ", stderr = open("/dev/null"), shell=True)
-
- matches = []
- cluster = []
- for l in mgaps_out.split("\n"):
- l = l.strip().split()
- if len(l) == 0:
- continue
- if l[0] == ">":
- seq_id = l[1]
-
- if len(cluster) != 0:
- matches.append(cluster)
-
- cluster = []
- continue
- if l[0] == "#":
- if len(cluster) != 0:
- matches.append(cluster)
- cluster = []
- continue
- len_ = int(l[2])
- r_s = int(l[0])
- q_s = int(l[1])
- r_e = r_s + len_
- q_e = q_s + len_
- cluster.append( ((r_s, r_e), (q_s, q_e)) )
- if len(cluster) != 0:
- matches.append(cluster)
- return matches
-
-
-u_edges = {}
-with open("./unit_edges.dat") as f:
- for l in f:
- v, w, path, seq = l.strip().split()
- u_edges.setdefault( (v, w), [] )
- u_edges[ (v, w) ].append( (path, seq) )
-
-
-p_tig_path = {}
-a_tig_path = {}
-with open("primary_tigs_paths_c") as f:
- for l in f:
- l = l.strip().split()
- id_ = l[0][1:]
- path = l[1:]
- p_tig_path[id_] = path
-
-with open("all_tigs_paths") as f:
- for l in f:
- l = l.strip().split()
- id_ = l[0][1:]
- path = l[1:]
- a_tig_path[id_] = path
-
-p_tig_seqs = {}
-for r in FastaReader("primary_tigs_c.fa"):
- p_tig_seqs[r.name] = r.sequence
-
-a_tig_seqs = {}
-for r in FastaReader("all_tigs.fa"):
- a_tig_seqs[r.name.split()[0]] = r.sequence
-
-p_tig_to_node_pos = {}
-node_pos = []
-with open("primary_tigs_node_pos_c") as f:
- for l in f:
- l = l.strip().split()
- p_tig_to_node_pos.setdefault( l[0], [])
- p_tig_to_node_pos[l[0]].append( (l[1], int(l[2])))
-
-duplicate_a_tigs = []
-with open("a_nodup.fa","w") as out_f:
- for p_tig_id in p_tig_path:
- main_path = p_tig_path[p_tig_id]
- main_path_nodes = set(main_path[:])
- p_tig_seq = p_tig_seqs[p_tig_id]
- a_node = []
- a_node_range = []
- a_node_range_map = {}
- node_to_pos = dict( p_tig_to_node_pos[p_tig_id] )
- for id_ in a_tig_path:
- if id_[:4] != p_tig_id[:4]:
- continue
- if id_.split("-")[1] == "0000":
- continue
-
- a_path = a_tig_path[id_]
- if a_path[0] in main_path_nodes and a_path[-1] in main_path_nodes:
- #print p_tig_id, id_, a_path[0], a_path[-1]
- s, e = node_to_pos[a_path[0]], node_to_pos[a_path[-1]]
- p_seq = p_tig_seq[s:e]
- a_seq = a_tig_seqs[id_]
- seq_match = get_matches(p_seq, a_seq)
- if len(seq_match) > 1:
- print >>out_f, ">"+id_
- print >>out_f, a_seq
- continue
- try:
- r_s, r_e = seq_match[0][0][0][0], seq_match[0][-1][0][1]
- except:
- print "XXX", seq_match
-                if 1.0* (r_e - r_s) / (e - s) > 0.98:
- print >>out_f, ">"+id_
- print >>out_f, a_seq
- continue
- duplicate_a_tigs.append(id_)
-
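
The deleted falcon_dedup.py above shells out to a mummer | mgaps pipeline and then groups the reported MUMmer matches between a primary tig and an associated tig into clusters; the cluster count and span drive the duplicate call. Below is a sketch of that grouping step (plain Python; the sample text is invented and only approximates real mgaps output):

SAMPLE_MGAPS = """\
> seq1
  100   90  50
  200  190  40
#
  500  480  30
"""

def parse_mgaps(text):
    # group (ref, query) match intervals into clusters, as get_matches() does
    matches, cluster = [], []
    for line in text.splitlines():
        parts = line.strip().split()
        if not parts:
            continue
        if parts[0] in (">", "#"):        # sequence header or cluster separator
            if cluster:
                matches.append(cluster)
            cluster = []
            continue
        r_s, q_s, match_len = int(parts[0]), int(parts[1]), int(parts[2])
        cluster.append(((r_s, r_s + match_len), (q_s, q_s + match_len)))
    if cluster:
        matches.append(cluster)
    return matches

print parse_mgaps(SAMPLE_MGAPS)           # two clusters for this sample
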
diff --git a/FALCON/src/py_scripts_v0.1/falcon_fixasm.py b/FALCON/src/py_scripts_v0.1/falcon_fixasm.py
deleted file mode 100644
index 525d5be..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_fixasm.py
+++ /dev/null
@@ -1,213 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import networkx as nx
-from pbcore.io import FastaReader
-
-def neighbor_bound(G, v, w, radius):
- g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
- g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
- if len(g1) < radius or len(g2) < radius:
- return True
- print v, len(g1), w, len(g2), radius
- if len(set(g1.edges()) & set(g2.edges())) > 0:
- return True
- else:
- return False
-
-def is_branch_node(G, n):
- out_edges = G.out_edges([n])
- n2 = [ e[1] for e in out_edges ]
- is_branch = False
- for i in range(len(n2)):
- for j in range(i+1, len(n2)):
- v = n2[i]
- w = n2[j]
- if neighbor_bound(G, v, w, 20) == False:
- is_branch = True
- break
- if is_branch == True:
- break
- return is_branch
-
-
-def get_r_path(r_edges, u_path):
- tiling_path = []
- pos = 0
-
- for i in range( len(u_path) - 1):
- v, w = u_path[i:i+2]
- r_edge_label, overlap = r_edges[ (v, w) ]
- r_edge_seq_id, range_ = r_edge_label.split(":")
- range_ = range_.split("-")
- s, e = int(range_[0]), int(range_[1])
- pos += abs(e-s)
- tiling_path.append( (pos, w, s, e) )
- return tiling_path
-
-def get_seq(u_edges, r_edges, path):
- subseqs = []
- pos = []
- cur_pos = 0
- full_tiling_path = []
-
- for i in range( len(path) - 1):
- v, w = path[i:i+2]
- pos.append( (v, cur_pos) )
- uedges = u_edges[ (v, w) ]
- uedges.sort( key= lambda x: len(x[0]) )
- subseqs.append( uedges[-1][1] )
- r_path = get_r_path( r_edges, uedges[-1][0].split("-") )
- r_path = [ ( x[0] + cur_pos, x[1], x[2], x[3]) for x in r_path ]
- full_tiling_path.extend( r_path )
- cur_pos += len( uedges[-1][1] )
- pos.append( (w, cur_pos) )
- return "".join(subseqs), pos, full_tiling_path
-
-
-u_edges = {}
-with open("unit_edges.dat") as f:
- for l in f:
- v, w, path, seq = l.strip().split()
- u_edges.setdefault( (v, w), [] )
- u_edges[ (v, w) ].append( (path, seq) )
-len(u_edges)
-
-
-r_edges = {}
-with open("edges_list") as f:
- for l in f:
- v, w, edge_label, overlap = l.strip().split()
- r_edges[ (v, w) ] = (edge_label, int(overlap) )
-
-
-primary_tigs_path = {}
-primary_path_graph = nx.DiGraph()
-begin_nodes = {}
-end_nodes ={}
-with open("primary_tigs_paths") as f:
- for l in f:
- l = l.strip().split()
- name = l[0][1:]
- path = l[1:]
- primary_tigs_path[name] = path
- if len(path) < 3:
- continue
- for i in range(len(path)-1):
- n1 = path[i].split(":")[0]
- n2 = path[i+1].split(":")[0]
- primary_path_graph.add_edge( n1, n2)
- begin_nodes.setdefault(path[0], [])
- begin_nodes[path[0]].append( name )
- end_nodes.setdefault(path[-1], [])
- end_nodes[path[-1]].append( name )
-
-
-
-path_names = primary_tigs_path.keys()
-path_names.sort()
-primary_path_graph_r = primary_path_graph.reverse()
-path_f = open("primary_tigs_paths_c","w")
-pos_f = open("primary_tigs_node_pos_c", "w")
-tiling_path_f = open("all_tiling_path_c", "w")
-with open("primary_tigs_c.fa","w") as out_f:
- for name in path_names:
- sub_idx = 0
- c_path = [ primary_tigs_path[name][0] ]
- for v in primary_tigs_path[name][1:]:
- break_path = False
-
- vn = v.split(":")[0]
-
- if primary_path_graph.out_degree(vn) > 1:
- break_path = is_branch_node(primary_path_graph, vn)
- if primary_path_graph.in_degree(vn) > 1:
- break_path = is_branch_node(primary_path_graph_r, vn)
- if break_path:
- c_path.append(v)
- seq, pos, full_tiling_path = get_seq(u_edges, r_edges, c_path)
- for p, w, s, e in full_tiling_path:
- print >> tiling_path_f, "%s_%02d" % (name, sub_idx), p, w, s, e
- #if len(full_tiling_path) <= 5:
- # continue
- print >>out_f, ">%s_%02d" % (name, sub_idx)
- print >>out_f, seq
- print >>path_f, ">%s_%02d" % (name, sub_idx), " ".join(c_path)
- #print c_path
- for node, p in pos:
- print >> pos_f, "%s_%02d %s %d" % (name, sub_idx, node, p)
- c_path = [v]
- sub_idx += 1
- else:
- c_path.append(v)
-
- if len(c_path) > 1:
- seq, pos, full_tiling_path = get_seq(u_edges, r_edges, c_path)
- for p, w, s, e in full_tiling_path:
- print >> tiling_path_f, "%s_%02d" % (name, sub_idx), p, w, s, e
- if len(full_tiling_path) <= 5:
- continue
- print >>out_f, ">%s_%02d" % (name, sub_idx)
- print >>out_f, seq
- print >>path_f, ">%s_%02d" % (name, sub_idx), " ".join(c_path)
- for node, p in pos:
- print >> pos_f, "%s_%02d %s %d" % (name, sub_idx, node, p)
-
-with open("all_tigs_paths") as f:
- for l in f:
- l = l.strip().split()
- name = l[0][1:]
- name = name.split("-")
- if name[1] == "0000":
- continue
- if len(name) == 2:
- path = l[1:]
- seq, pos, full_tiling_path = get_seq(u_edges, r_edges, path)
- for p, w, s, e in full_tiling_path:
- print >> tiling_path_f, "%s" % ("-".join(name)), p, w, s, e
- else:
- path = l[1:]
- full_tiling_path = get_r_path(r_edges, path)
- for p, w, s, e in full_tiling_path:
- print >> tiling_path_f, "%s" % ("-".join(name)), p, w, s, e
-
-
-path_f.close()
-tiling_path_f.close()
-pos_f.close()
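
Both the deleted falcon_fixasm.py above and falcon_asm_s.py earlier break paths at "branch nodes": nodes with a pair of successors whose downstream paths never reconverge within a small radius (10 in the assembler, 20 here). A toy restatement of that test follows (plain Python with networkx, which the deleted scripts also use; the toy graph is invented):

import networkx as nx

def neighbor_bound(G, v, w, radius):
    # do the forward neighborhoods of v and w share an edge within `radius`?
    g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
    g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
    return len(set(g1.edges()) & set(g2.edges())) > 0

def is_branch_node(G, n, radius=10):
    # n is a branch node if some pair of its successors never reconverges
    successors = [e[1] for e in G.out_edges([n])]
    for i in range(len(successors)):
        for j in range(i + 1, len(successors)):
            if not neighbor_bound(G, successors[i], successors[j], radius):
                return True
    return False

G = nx.DiGraph()
G.add_edges_from([("a", "b"), ("a", "c"),                 # divergence at "a"
                  ("b", "d"), ("c", "e"),
                  ("d", "f"), ("e", "f"), ("f", "g")])     # both sides rejoin at "f"
print is_branch_node(G, "a")    # False: the branches share the edge ("f", "g")
G.remove_edge("e", "f")
print is_branch_node(G, "a")    # True: the branches no longer reconverge
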
diff --git a/FALCON/src/py_scripts_v0.1/falcon_overlap.py b/FALCON/src/py_scripts_v0.1/falcon_overlap.py
deleted file mode 100755
index 1ab5a99..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_overlap.py
+++ /dev/null
@@ -1,328 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs, seqs
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-def get_ovelap_alignment(seq1, seq0):
-
- K = 8
- lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
- sa_ptr = kup.allocate_seq( len(seq0) )
- sda_ptr = kup.allocate_seq_addr( len(seq0) )
- kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
- #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
- aln_range = aln_range_ptr[0]
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
- e1 += K + K/2
- e0 += K + K/2
- kup.free_aln_range(aln_range)
- len_1 = len(seq1)
- len_0 = len(seq0)
- if e1 > len_1:
- e1 = len_1
- if e0 > len_0:
- e0 = len_0
- do_aln = False
- contain_status = "none"
- #print s0, e0, s1, e1
- if e1 - s1 > 500:
- if s0 < s1 and s0 > 24:
- do_aln = False
- elif s1 <= s0 and s1 > 24:
- do_aln = False
- elif s1 < 24 and len_1 - e1 < 24:
- do_aln = True
- contain_status = "contains"
- #print "X1"
- elif s0 < 24 and len_0 - e0 < 24:
- do_aln = True
- contain_status = "contained"
- #print "X2"
- else:
- do_aln = True
- if s0 < s1:
- s1 -= s0 #assert s1 > 0
- s0 = 0
- e1 = len_1
- #if len_1 - s1 >= len_0:
- # do_aln = False
- # contain_status = "contains"
- # print "X3", s0, e0, len_0, s1, e1, len_1
-
-
- elif s1 <= s0:
- s0 -= s1 #assert s1 > 0
- s1 = 0
- e0 = len_0
- #print s0, e0, s1, e1
- #if len_0 - s0 >= len_1:
- # do_aln = False
- # contain_status = "contained"
- # print "X4"
- #if abs( (e1 - s1) - (e0 - s0 ) ) > 200: #avoid overlap alignment for big indels
- # do_aln = False
-
- if do_aln:
- alignment = DWA.align(seq1[s1:e1], e1-s1,
- seq0[s0:e0], e0-s0,
- 500, 0)
- #print seq1[s1:e1]
- #print seq0[s2:e2]
- #if alignment[0].aln_str_size > 500:
-
- #aln_str1 = alignment[0].q_aln_str
- #aln_str0 = alignment[0].t_aln_str
- aln_size = alignment[0].aln_str_size
- aln_dist = alignment[0].dist
- aln_q_s = alignment[0].aln_q_s
- aln_q_e = alignment[0].aln_q_e
- aln_t_s = alignment[0].aln_t_s
- aln_t_e = alignment[0].aln_t_e
- assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
- #print aln_str1
- #print aln_str0
- if aln_size > 500 and contain_status == "none":
- contain_status = "overlap"
- DWA.free_alignment(alignment)
-
- kup.free_seq_addr_array(sda_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_kmer_lookup(lk_ptr)
-
- if do_aln:
- if s1 > 1000 and s0 > 1000:
- return 0, 0, 0, 0, 0, 0, "none"
- if len_1 - (s1+aln_q_e-aln_q_s) > 1000 and len_0 - (s0+aln_t_e-aln_t_s) > 1000:
- return 0, 0, 0, 0, 0, 0, "none"
-
-
-
-
- if e1 - s1 > 500 and do_aln and aln_size > 500:
- #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
- return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
- else:
- return 0, 0, 0, 0, 0, 0, contain_status
-
-def get_candidate_aln(hit_input):
-
- global q_seqs
- q_name, hit_index_f, hit_index_r = hit_input
- q_seq = q_seqs[q_name]
-
- rtn = []
-
- hit_index = hit_index_f
- c = collections.Counter(hit_index)
- s = [c[0] for c in c.items() if c[1] >50]
- #s.sort()
- targets = set()
- for p in s:
- hit_id = seqs[p][0]
- if hit_id in targets or hit_id == q_name:
- continue
- targets.add(hit_id)
- seq1, seq0 = q_seq, q_seqs[hit_id]
- aln_data = get_ovelap_alignment(seq1, seq0)
- #rtn = get_alignment(seq1, seq0)
- if rtn != None:
-
- s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
- #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0), aln_size, aln_dist
- rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
- 0, s2, e2, len(seq0),
- 0, s1, e1, len(seq1), c_status ) )
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- hit_index = hit_index_r
- c = collections.Counter(hit_index)
- s = [c[0] for c in c.items() if c[1] >50]
- #s.sort()
- targets = set()
- for p in s:
- hit_id = seqs[p][0]
- if hit_id in targets or hit_id == q_name:
- continue
- targets.add(hit_id)
- seq1, seq0 = r_q_seq, q_seqs[hit_id]
- aln_data = get_ovelap_alignment(seq1, seq0)
- #rtn = get_alignment(seq1, seq0)
- if rtn != None:
- s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
- #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0), aln_size, aln_dist
- rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
- 0, s2, e2, len(seq0),
- 1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status ) )
-
- return rtn
-
-def build_look_up(seqs, K):
- global sa_ptr, sda_ptr, lk_ptr
-
- total_index_base = len(seqs) * 1000
- sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- kup.init_seq_array(c_sa_ptr, total_index_base)
-
- sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
- lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
- kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
- start = 0
- for r_name, seq in seqs:
- kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
- start += 1000
-
- kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 512)
-
- #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
-
- global sa_ptr, sda_ptr, lk_ptr
- global q_seqs
-
- K = 14
- q_seq = q_seqs[q_name]
-
- rtn = []
-
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
- return q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
- for q_name, q_seq in q_seqs.items():
- yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
- for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
- yield mr
-
-
-if __name__ == "__main__":
- import argparse
- parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
-    parser.add_argument('fasta_file', help='a fasta file of reads; all pairwise overlaps within it are computed')
- parser.add_argument('--min_len', type=int, default=4000,
- help='minimum length of the reads to be considered for overlapping')
- parser.add_argument('--n_core', type=int, default=1,
-                        help='number of processes used for detailed overlapping evaluation')
- parser.add_argument('--d_core', type=int, default=1,
- help='number of processes used for k-mer matching')
-
-
- args = parser.parse_args()
-
- seqs = []
- q_seqs = {}
-    f = FastaReader(args.fasta_file) # read the input fasta file given on the command line
-
- if args.min_len < 2200:
- args.min_len = 2200
-
- idx = 0
- for r in f:
- if len(r.sequence) < args.min_len:
- continue
- seq = r.sequence.upper()
- for start in range(0, len(seq), 1000):
- if start+1000 > len(seq):
- break
- seqs.append( (r.name, seq[start: start+1000]) )
- idx += 1
-
- #seqs.append( (r.name, seq[:1000]) )
- seqs.append( (r.name, seq[-1000:]) )
- idx += 1
-
- q_seqs[r.name] = seq
-
-
- total_index_base = len(seqs) * 1000
- pool = mp.Pool(args.n_core)
- K = 14
- build_look_up(seqs, K)
- m_pool = mp.Pool(args.d_core)
-
-
- #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
- for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
- for h in r:
- print " ".join([str(x) for x in h])
-
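
The deleted falcon_overlap.py above finds candidate overlaps by cutting every read into 1 kb chunks, indexing the 14-mers of all chunks in one shared table, and nominating the reads whose chunks collect at least 50 k-mer hits from a query (the script recovers the chunk index by dividing the global hit position by 1000). Here is a dictionary-based sketch of the same idea, without the falcon_kit C k-mer lookup; any read names, sequences and thresholds supplied by a caller are illustrative only:

from collections import Counter

CHUNK, K = 1000, 14

def build_chunk_index(reads):
    # one entry per 1 kb chunk; the table maps a k-mer to the chunk indices containing it
    chunks, table = [], {}
    for name, seq in reads.items():
        pieces = [seq[i:i + CHUNK] for i in range(0, len(seq) - CHUNK + 1, CHUNK)]
        pieces.append(seq[-CHUNK:])            # the script also indexes the final 1 kb
        for piece in pieces:
            idx = len(chunks)
            chunks.append((name, piece))
            for i in range(len(piece) - K + 1):
                table.setdefault(piece[i:i + K], []).append(idx)
    return chunks, table

def candidate_targets(q_name, q_seq, chunks, table, min_hits=50):
    # count shared k-mers per chunk, then map well-supported chunks back to read names
    hits = Counter()
    for i in range(len(q_seq) - K + 1):
        for idx in table.get(q_seq[i:i + K], ()):
            hits[idx] += 1
    return set(chunks[idx][0] for idx, n in hits.items()
               if n >= min_hits and chunks[idx][0] != q_name)
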
diff --git a/FALCON/src/py_scripts_v0.1/falcon_overlap2.py b/FALCON/src/py_scripts_v0.1/falcon_overlap2.py
deleted file mode 100755
index a8f632c..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_overlap2.py
+++ /dev/null
@@ -1,337 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs,t_seqs, seqs
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-def get_ovelap_alignment(seq1, seq0):
-
- K = 8
- lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
- sa_ptr = kup.allocate_seq( len(seq0) )
- sda_ptr = kup.allocate_seq_addr( len(seq0) )
- kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
- #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
- aln_range = aln_range_ptr[0]
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
- e1 += K + K/2
- e0 += K + K/2
- kup.free_aln_range(aln_range)
- len_1 = len(seq1)
- len_0 = len(seq0)
- if e1 > len_1:
- e1 = len_1
- if e0 > len_0:
- e0 = len_0
- do_aln = False
- contain_status = "none"
- #print s0, e0, s1, e1
- if e1 - s1 > 500:
- if s0 < s1 and s0 > 24:
- do_aln = False
- elif s1 <= s0 and s1 > 24:
- do_aln = False
- elif s1 < 24 and len_1 - e1 < 24:
- do_aln = True
- contain_status = "contains"
- #print "X1"
- elif s0 < 24 and len_0 - e0 < 24:
- do_aln = True
- contain_status = "contained"
- #print "X2"
- else:
- do_aln = True
- if s0 < s1:
- s1 -= s0 #assert s1 > 0
- s0 = 0
- e1 = len_1
- #if len_1 - s1 >= len_0:
- # do_aln = False
- # contain_status = "contains"
- # print "X3", s0, e0, len_0, s1, e1, len_1
-
-
- elif s1 <= s0:
- s0 -= s1 #assert s1 > 0
- s1 = 0
- e0 = len_0
- #print s0, e0, s1, e1
- #if len_0 - s0 >= len_1:
- # do_aln = False
- # contain_status = "contained"
- # print "X4"
- #if abs( (e1 - s1) - (e0 - s0 ) ) > 200: #avoid overlap alignment for big indels
- # do_aln = False
-
- if do_aln:
- alignment = DWA.align(seq1[s1:e1], e1-s1,
- seq0[s0:e0], e0-s0,
- 500, 0)
- #print seq1[s1:e1]
- #print seq0[s2:e2]
- #if alignment[0].aln_str_size > 500:
-
- #aln_str1 = alignment[0].q_aln_str
- #aln_str0 = alignment[0].t_aln_str
- aln_size = alignment[0].aln_str_size
- aln_dist = alignment[0].dist
- aln_q_s = alignment[0].aln_q_s
- aln_q_e = alignment[0].aln_q_e
- aln_t_s = alignment[0].aln_t_s
- aln_t_e = alignment[0].aln_t_e
- assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
- #print aln_str1
- #print aln_str0
- if aln_size > 500 and contain_status == "none":
- contain_status = "overlap"
- DWA.free_alignment(alignment)
-
- kup.free_seq_addr_array(sda_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_kmer_lookup(lk_ptr)
-
- if do_aln:
- if s1 > 1000 and s0 > 1000:
- return 0, 0, 0, 0, 0, 0, "none"
- if len_1 - (s1+aln_q_e-aln_q_s) > 1000 and len_0 - (s0+aln_t_e-aln_t_s) > 1000:
- return 0, 0, 0, 0, 0, 0, "none"
-
- if e1 - s1 > 500 and do_aln and aln_size > 500:
- #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
- return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
- else:
- return 0, 0, 0, 0, 0, 0, contain_status
-
-def get_candidate_aln(hit_input):
-
- global q_seqs, seqs, t_seqs
- q_name, hit_index_f, hit_index_r = hit_input
- q_seq = q_seqs[q_name]
-
- rtn = []
-
- hit_index = hit_index_f
- c = collections.Counter(hit_index)
- s = [c[0] for c in c.items() if c[1] >50]
- #s.sort()
- targets = set()
- for p in s:
- hit_id = seqs[p][0]
- if hit_id in targets or hit_id == q_name:
- continue
- targets.add(hit_id)
- seq1, seq0 = q_seq, t_seqs[hit_id]
- aln_data = get_ovelap_alignment(seq1, seq0)
- #rtn = get_alignment(seq1, seq0)
- if rtn != None:
-
- s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
- if c_status == "none":
- continue
- #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0), aln_size, aln_dist
- rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
- 0, s2, e2, len(seq0),
- 0, s1, e1, len(seq1), c_status ) )
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- hit_index = hit_index_r
- c = collections.Counter(hit_index)
- s = [c[0] for c in c.items() if c[1] >50]
- #s.sort()
- targets = set()
- for p in s:
- hit_id = seqs[p][0]
- if hit_id in targets or hit_id == q_name:
- continue
- targets.add(hit_id)
- seq1, seq0 = r_q_seq, t_seqs[hit_id]
- aln_data = get_ovelap_alignment(seq1, seq0)
- #rtn = get_alignment(seq1, seq0)
- if rtn != None:
- s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
- if c_status == "none":
- continue
- #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0), aln_size, aln_dist
- rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)),
- 0, s2, e2, len(seq0),
- 1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status ) )
-
- return rtn
-
-def build_look_up(seqs, K):
- global sa_ptr, sda_ptr, lk_ptr
-
- total_index_base = len(seqs) * 1000
- sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- kup.init_seq_array(c_sa_ptr, total_index_base)
-
- sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
- lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
- kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
- start = 0
- for r_name, seq in seqs:
- kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
- start += 1000
-
- kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 256)
-
- #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
-
- global sa_ptr, sda_ptr, lk_ptr
- global q_seqs
-
- K = 14
- q_seq = q_seqs[q_name]
-
- rtn = []
-
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
- return q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
- for q_name, q_seq in q_seqs.items():
- yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
- for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
- yield mr
-
-
-if __name__ == "__main__":
- import argparse
- parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
-    parser.add_argument('query_fa', help='a fasta file of query reads to be overlapped with the sequences in target_fa')
- parser.add_argument('target_fa', help='a fasta file as the target sequences for overlapping')
- parser.add_argument('--min_len', type=int, default=4000,
- help='minimum length of the reads to be considered for overlapping')
- parser.add_argument('--n_core', type=int, default=1,
- help='number of processes used for detailed overlapping evaluation')
- parser.add_argument('--d_core', type=int, default=1,
- help='number of processes used for k-mer matching')
-
-
- args = parser.parse_args()
-
- seqs = []
- q_seqs = {}
- t_seqs = {}
- f = FastaReader(args.target_fa) # take one command line argument of the input fasta file name
-
- if args.min_len < 2200:
- args.min_len = 2200
-
- idx = 0
- for r in f:
- if len(r.sequence) < args.min_len:
- continue
- seq = r.sequence.upper()
- for start in range(0, len(seq), 1000):
- if start+1000 > len(seq):
- break
- seqs.append( (r.name, seq[start: start+1000]) )
- idx += 1
-
- seqs.append( (r.name, seq[-1000:]) )
- idx += 1
-
- t_seqs[r.name] = seq
-
- f = FastaReader(args.query_fa) # take one command line argument of the input fasta file name
- for r in f:
- if len(r.sequence) < args.min_len:
- continue
- seq = r.sequence.upper()
- q_seqs[r.name] = seq
-
-
- total_index_base = len(seqs) * 1000
- pool = mp.Pool(args.n_core)
- K = 14
- build_look_up(seqs, K)
- m_pool = mp.Pool(args.d_core)
-
-
- #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
- for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
- for h in r:
- print " ".join([str(x) for x in h])
-
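The overlapper script removed above finds candidate targets by mapping every k-mer hit position back to the 1 kb target segment it came from (integer division by 1000) and keeping only targets whose segments collect more than 50 hits before any detailed alignment is attempted. A minimal sketch of that binning step, with hypothetical inputs and a hypothetical function name (not part of the removed file):

    # Sketch only: bin k-mer hit coordinates into 1 kb index segments and keep
    # the heavily hit ones, as the removed overlapper did before aligning.
    import collections
    import numpy as np

    def candidate_segments(target_pos, min_hits=50, segment=1000):
        # target_pos: hit coordinates on the concatenated target index; each
        # 1 kb block of the index corresponds to one target read chunk.
        hit_index = np.asarray(target_pos) // segment
        counts = collections.Counter(hit_index.tolist())
        return [seg for seg, n in counts.items() if n > min_hits]
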
diff --git a/FALCON/src/py_scripts_v0.1/falcon_qrm.py b/FALCON/src/py_scripts_v0.1/falcon_qrm.py
deleted file mode 100755
index 805fcc6..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_qrm.py
+++ /dev/null
@@ -1,370 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-import math
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs,t_seqs, seqs
-global n_candidates, max_candidates
-
-seqs = []
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-all_fivemers = []
-cmap = {0:"A", 1:"T", 2:"C", 3:"G"}
-for i in range(1024):
- mer = []
- for j in range(5):
- mer.append( cmap[ i >> (2 *j) & 3 ])
- all_fivemers.append("".join(mer))
-
-def fivemer_entropy(seq):
- five_mer_count = {}
-
- for i in range(len(seq)-5):
- five_mer = seq[i:i+5]
- five_mer_count.setdefault(five_mer, 0)
- five_mer_count[five_mer] += 1
-
- entropy = 0.0
- for five_mer in all_fivemers:
- p = five_mer_count.get(five_mer, 0) + 1.0
- p /= len(seq)
- entropy += - p * math.log(p)
-
- return entropy
-
-def get_alignment(seq1, seq0):
-
- K = 8
- lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
- sa_ptr = kup.allocate_seq( len(seq0) )
- sda_ptr = kup.allocate_seq_addr( len(seq0) )
- kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
- kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
- kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
- #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
- aln_range = aln_range_ptr[0]
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score
- e1 += K + K/2
- e0 += K + K/2
- kup.free_aln_range(aln_range)
- len_1 = len(seq1)
- len_0 = len(seq0)
- if e1 > len_1:
- e1 = len_1
- if e0 > len_0:
- e0 = len_0
-
- aln_size = 1
- if e1 - s1 > 500:
-
- aln_size = max( e1-s1, e0-s0 )
- aln_score = int(km_score * 48)
- aln_q_s = s1
- aln_q_e = e1
- aln_t_s = s0
- aln_t_e = e0
-
- kup.free_seq_addr_array(sda_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_kmer_lookup(lk_ptr)
-
- if s1 > 1000 and s0 > 1000:
- return 0, 0, 0, 0, 0, 0, "none"
-
- if len_1 - e1 > 1000 and len_0 - e0 > 1000:
- return 0, 0, 0, 0, 0, 0, "none"
-
-
- if e1 - s1 > 500 and aln_size > 500:
- return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
- else:
- return 0, 0, 0, 0, 0, 0, "none"
-
-def get_candidate_aln(hit_input):
-
- global q_seqs, seqs, t_seqs, q_len
- global max_candidates
- global n_candidates
- q_name, hit_index_f, hit_index_r = hit_input
- q_seq = q_seqs[q_name]
-
- rtn = []
- hit_index = hit_index_f
- c = collections.Counter(hit_index)
- s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
-
- hit_data = {}
- #hit_ids = set()
-
- for p, hit_count in s:
- hit_id = seqs[p][0]
- hit_data.setdefault(hit_id, [0, 0 ,0])
- hit_data[hit_id][0] += hit_count;
- if hit_count > hit_data[hit_id][1]:
- hit_data[hit_id][1] = hit_count
- hit_data[hit_id][2] += 1
-
- hit_data = hit_data.items()
-
- hit_data.sort( key=lambda x:-x[1][0] )
-
- target_count = {}
- total_hit = 0
-
- for hit in hit_data[:n_candidates]:
- hit_id = hit[0]
- hit_count = hit[1][0]
- target_count.setdefault(hit_id, 0)
- if target_count[hit_id] > max_candidates:
- continue
- if total_hit > max_candidates:
- continue
- seq1, seq0 = q_seq, t_seqs[hit_id]
- aln_data = get_alignment(seq1, seq0)
- if rtn != None:
-
- s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
- if c_status == "none":
- continue
- target_count[hit_id] += 1
- total_hit += 1
- rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
- 0, s1, e1, len(seq1),
- 0, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- hit_index = hit_index_r
- c = collections.Counter(hit_index)
- s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
-
- hit_data = {}
- #hit_ids = set()
-
- for p, hit_count in s:
- hit_id = seqs[p][0]
- hit_data.setdefault(hit_id, [0, 0 ,0])
- hit_data[hit_id][0] += hit_count;
- if hit_count > hit_data[hit_id][1]:
- hit_data[hit_id][1] = hit_count
- hit_data[hit_id][2] += 1
-
- hit_data = hit_data.items()
-
- hit_data.sort( key=lambda x:-x[1][0] )
-
-
- target_count = {}
- total_hit = 0
-
- for hit in hit_data[:n_candidates]:
- hit_id = hit[0]
- hit_count = hit[1][0]
- target_count.setdefault(hit_id, 0)
- if target_count[hit_id] > max_candidates:
- continue
- if total_hit > max_candidates:
- continue
- seq1, seq0 = r_q_seq, t_seqs[hit_id]
- aln_data = get_alignment(seq1, seq0)
- if rtn != None:
- s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
- if c_status == "none":
- continue
- target_count[hit_id] += 1
- total_hit += 1
- rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
- 0, len(seq1) - e1, len(seq1) - s1, len(seq1),
- 1, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
- return rtn
-
-def build_look_up(seqs, K):
- global sa_ptr, sda_ptr, lk_ptr
-
- total_index_base = len(seqs) * 1000
- sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- kup.init_seq_array(c_sa_ptr, total_index_base)
-
- sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
- lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
- kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
- start = 0
- for r_name, seq in seqs:
- kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
- start += 1000
-
- kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 1024)
-
- #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
-
- global sa_ptr, sda_ptr, lk_ptr
- global q_seqs
-
- K = 14
- q_seq = q_seqs[q_name]
-
- rtn = []
-
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
- return q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
- for q_name, q_seq in q_seqs.items():
- yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
- for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
- yield mr
-
-
-if __name__ == "__main__":
- import argparse
- parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
- parser.add_argument('target_fofn', help='a fofn of fasta files containing the target sequences for overlapping')
- parser.add_argument('query_fofn', help='a fofn of fasta files to be overlapped with the sequences in the target')
- parser.add_argument('--min_len', type=int, default=4000,
- help='minimum length of the reads to be considered for overlapping')
- parser.add_argument('--n_core', type=int, default=1,
- help='number of processes used for detailed overlapping evaluation')
- parser.add_argument('--d_core', type=int, default=1,
- help='number of processes used for k-mer matching')
- parser.add_argument('--n_candidates', type=int, default=128,
- help='number of candidates for read matching')
- parser.add_argument('--max_candidates', type=int, default=64,
- help='maximum number of read matches to output')
-
-
-
- args = parser.parse_args()
-
- max_candidates = args.max_candidates
- n_candidates = args.n_candidates
-
- q_seqs = {}
- t_seqs = {}
- if args.min_len < 1200:
- args.min_len = 1200
-
- with open(args.target_fofn) as fofn:
- for fn in fofn:
- fn = fn.strip()
- f = FastaReader(fn) # take one command line argument of the input fasta file name
- for r in f:
- if len(r.sequence) < args.min_len:
- continue
- seq = r.sequence.upper()
- for start in range(0, len(seq), 1000):
- if start+1000 > len(seq):
- break
- subseq = seq[start: start+1000]
- #if fivemer_entropy(subseq) < 4:
- # continue
- seqs.append( (r.name, subseq) )
- subseq = seq[-1000:]
- #if fivemer_entropy(subseq) < 4:
- # continue
- #seqs.append( (r.name, seq[:1000]) )
- seqs.append( (r.name, subseq) )
-
- t_seqs[r.name] = seq
-
- with open(args.query_fofn) as fofn:
- for fn in fofn:
- fn = fn.strip()
- f = FastaReader(fn) # take one command line argument of the input fasta file name
- for r in f:
- seq = r.sequence.upper()
- #if fivemer_entropy(seq) < 4:
- # continue
- q_seqs[r.name] = seq
-
-
- pool = mp.Pool(args.n_core)
- K = 14
- build_look_up(seqs, K)
- m_pool = mp.Pool(args.d_core)
-
-
- #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
- for r in pool.imap(get_candidate_aln, lookup_data_iterator(q_seqs, m_pool)):
- for h in r:
- print " ".join([str(x) for x in h])
-
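falcon_qrm.py, removed above, ranks candidate targets before alignment: for each target it accumulates the total k-mer hit count, the best per-segment count, and the number of segments hit, sorts by total hits, and caps the list with --n_candidates and --max_candidates. A hedged sketch of that ranking step (names are illustrative, not the removed code):

    # Sketch only: accumulate per-target hit statistics and keep the
    # top-scoring candidates, mirroring the ranking loop in the removed script.
    import collections

    def rank_candidates(segment_hits, n_candidates=128):
        # segment_hits: iterable of (target_id, hit_count) pairs, one per
        # 1 kb target segment that passed the hit-count threshold.
        stats = collections.defaultdict(lambda: [0, 0, 0])
        for target_id, hit_count in segment_hits:
            stats[target_id][0] += hit_count                            # total hits
            stats[target_id][1] = max(stats[target_id][1], hit_count)   # best segment
            stats[target_id][2] += 1                                    # segments hit
        return sorted(stats.items(), key=lambda kv: -kv[1][0])[:n_candidates]
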
diff --git a/FALCON/src/py_scripts_v0.1/falcon_qrm_0.py b/FALCON/src/py_scripts_v0.1/falcon_qrm_0.py
deleted file mode 100755
index 2cb6e77..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_qrm_0.py
+++ /dev/null
@@ -1,378 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from falcon_kit import *
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-import multiprocessing as mp
-from multiprocessing import sharedctypes
-from ctypes import *
-import math
-
-global sa_ptr, sda_ptr, lk_ptr
-global q_seqs,t_seqs, seqs
-
-seqs = []
-RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-
-all_fivemers = []
-cmap = {0:"A", 1:"T", 2:"C", 3:"G"}
-for i in range(1024):
- mer = []
- for j in range(5):
- mer.append( cmap[ i >> (2 *j) & 3 ])
- all_fivemers.append("".join(mer))
-
-def fivemer_entropy(seq):
- five_mer_count = {}
-
- for i in range(len(seq)-5):
- five_mer = seq[i:i+5]
- five_mer_count.setdefault(five_mer, 0)
- five_mer_count[five_mer] += 1
-
- entropy = 0.0
- for five_mer in all_fivemers:
- p = five_mer_count.get(five_mer, 0) + 1.0
- p /= len(seq)
- entropy += - p * math.log(p)
-
- return entropy
-
-def get_alignment(seq1, seq0):
-
- K = 8
- lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
- sa_ptr = kup.allocate_seq( len(seq0) )
- sda_ptr = kup.allocate_seq_addr( len(seq0) )
- kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
- kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
- kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
- #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
- aln_range = aln_range_ptr[0]
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score
- e1 += K + K/2
- e0 += K + K/2
- kup.free_aln_range(aln_range)
- len_1 = len(seq1)
- len_0 = len(seq0)
- if e1 > len_1:
- e1 = len_1
- if e0 > len_0:
- e0 = len_0
-
- aln_size = 1
- if e1 - s1 > 500:
-
- #aln_size = max( e1-s1, e0-s0 )
- #aln_score = int(km_score * 2)
- #aln_q_s = s1
- #aln_q_e = e1
- #aln_t_s = s0
- #aln_t_e = e0
-
- alignment = DWA.align(seq1[s1:e1], e1-s1,
- seq0[s0:e0], e0-s0,
- 500, 0)
- aln_size = alignment[0].aln_str_size
- aln_score = 4 * alignment[0].aln_str_size - 5 * alignment[0].dist
- aln_q_s = alignment[0].aln_q_s
- aln_q_e = alignment[0].aln_q_e
- aln_t_s = alignment[0].aln_t_s
- aln_t_e = alignment[0].aln_t_e
- assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
- #print aln_str1
- #print aln_str0
-
- if aln_size > 500:
- contain_status = "overlap"
- DWA.free_alignment(alignment)
-
- kup.free_seq_addr_array(sda_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_kmer_lookup(lk_ptr)
-
- if e1 - s1 > 500 and aln_size > 500:
- return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
- else:
- return 0, 0, 0, 0, 0, 0, "none"
-
-def get_candidate_aln(hit_input):
-
- global q_seqs, seqs, t_seqs, q_len
- q_name, hit_index_f, hit_index_r = hit_input
- q_seq = q_seqs[q_name]
-
- rtn = []
- hit_index = hit_index_f
- c = collections.Counter(hit_index)
- s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
-
- hit_data = []
- hit_ids = set()
- for p, hit_count in s:
- hit_id = seqs[p][0]
- if hit_id == q_name or hit_id in hit_ids:
- continue
- if hit_id not in hit_ids:
- hit_ids.add(hit_id)
- hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
-
- hit_data.sort( key=lambda x:-x[2] )
-
- target_count = {}
- total_hit = 0
-
- for hit in hit_data:
- hit_id = hit[0]
- hit_count = hit[3]
- target_count.setdefault(hit_id, 0)
- if target_count[hit_id] > 64:
- continue
- if total_hit > 64:
- continue
- seq1, seq0 = q_seq, hit[1]
- aln_data = get_alignment(seq1, seq0)
- if rtn != None:
-
- s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
- if c_status == "none":
- continue
- """
- if e1 - s1 < 5000:
- if -aln_score > -8000:
- continue
- if (100.0*aln_score/(aln_size+1)) < 150:
- continue
- """
- target_count[hit_id] += 1
- total_hit += 1
- rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
- 0, s1, e1, len(seq1),
- 0, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- hit_index = hit_index_r
- c = collections.Counter(hit_index)
- s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
-
- hit_data = []
- hit_ids = set()
- for p, hit_count in s:
- hit_id = seqs[p][0]
- if hit_id == q_name or hit_id in hit_ids:
- continue
- if hit_id not in hit_ids:
- hit_ids.add(hit_id)
- hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
-
- hit_data.sort( key=lambda x:-x[2] )
-
- target_count = {}
- total_hit = 0
-
- for hit in hit_data:
- hit_id = hit[0]
- hit_count = hit[3]
- target_count.setdefault(hit_id, 0)
- if target_count[hit_id] > 64:
- continue
- if total_hit > 64:
- continue
- seq1, seq0 = r_q_seq, hit[1]
- aln_data = get_alignment(seq1, seq0)
- if rtn != None:
- s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
- if c_status == "none":
- continue
- """
- if e1 - s1 < 5000:
- if -aln_score > -8000:
- continue
- if (100.0*aln_score/(aln_size+1)) < 150:
- continue
- """
- target_count[hit_id] += 1
- total_hit += 1
- rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)),
- 0, len(seq1) - e1, len(seq1) - s1, len(seq1),
- 1, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
-
- return rtn
-
-def build_look_up(seqs, K):
- global sa_ptr, sda_ptr, lk_ptr
-
- total_index_base = len(seqs) * 1000
- sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- kup.init_seq_array(c_sa_ptr, total_index_base)
-
- sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
-
- lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
- kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
-
- start = 0
- for r_name, seq in seqs:
- kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
- start += 1000
-
- kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 1024)
-
- #return sda_ptr, sa_ptr, lk_ptr
-
-
-
-def get_candidate_hits(q_name):
-
- global sa_ptr, sda_ptr, lk_ptr
- global q_seqs
-
- K = 14
- q_seq = q_seqs[q_name]
-
- rtn = []
-
- c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
- c_sa_ptr = cast(sa_ptr, POINTER(base_t))
- c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
-
- r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
- kup.free_kmer_match(kmer_match_ptr)
- return q_name, hit_index_f, hit_index_r
-
-
-def q_names( q_seqs ):
- for q_name, q_seq in q_seqs.items():
- yield q_name
-
-
-def lookup_data_iterator( q_seqs, m_pool ):
- for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
- yield mr
-
-
-if __name__ == "__main__":
- import argparse
- parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
- parser.add_argument('target_fofn', help='a fofn of fasta files containing the target sequences for overlapping')
- parser.add_argument('query_fofn', help='a fofn of fasta files to be overlapped with the sequences in the target')
- parser.add_argument('--min_len', type=int, default=4000,
- help='minimum length of the reads to be considered for overlapping')
- parser.add_argument('--n_core', type=int, default=1,
- help='number of processes used for detailed overlapping evaluation')
- parser.add_argument('--d_core', type=int, default=1,
- help='number of processes used for k-mer matching')
-
-
- args = parser.parse_args()
-
- q_seqs = {}
- t_seqs = {}
- if args.min_len < 1200:
- args.min_len = 1200
-
- with open(args.target_fofn) as fofn:
- for fn in fofn:
- fn = fn.strip()
- f = FastaReader(fn) # take one command line argument of the input fasta file name
- for r in f:
- if len(r.sequence) < args.min_len:
- continue
- seq = r.sequence.upper()
- for start in range(0, len(seq), 1000):
- if start+1000 > len(seq):
- break
- subseq = seq[start: start+1000]
- #if fivemer_entropy(subseq) < 4:
- # continue
- seqs.append( (r.name, subseq) )
- subseq = seq[-1000:]
- #if fivemer_entropy(subseq) < 4:
- # continue
- #seqs.append( (r.name, seq[:1000]) )
- seqs.append( (r.name, subseq) )
-
- t_seqs[r.name] = seq
-
- with open(args.query_fofn) as fofn:
- for fn in fofn:
- fn = fn.strip()
- f = FastaReader(fn) # take one command line argument of the input fasta file name
- for r in f:
- #if len(r.sequence) < args.min_len:
- # continue
- seq = r.sequence.upper()
- if fivemer_entropy(seq) < 4:
- continue
- q_seqs[r.name] = seq
-
-
- pool = mp.Pool(args.n_core)
- K = 14
- build_look_up(seqs, K)
- m_pool = mp.Pool(args.d_core)
-
-
- #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
- for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
- for h in r:
- print " ".join([str(x) for x in h])
-
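falcon_qrm_0.py, removed above, drops low-complexity query reads with its fivemer_entropy() function: a smoothed Shannon entropy computed over all 1024 possible 5-mers, with reads scoring below 4 discarded. A short self-contained usage sketch of that filter under the same cutoff (the test sequences are made up):

    # Sketch only: smoothed 5-mer entropy as in the removed fivemer_entropy();
    # a long homopolymer scores well below the cutoff of 4, while a
    # mixed-composition read scores above it, so only the latter is kept.
    import math
    import random

    def fivemer_entropy(seq):
        counts = {}
        for i in range(len(seq) - 5):
            counts[seq[i:i+5]] = counts.get(seq[i:i+5], 0) + 1
        entropy = 0.0
        for i in range(1024):
            mer = "".join("ATCG"[(i >> (2 * j)) & 3] for j in range(5))
            p = (counts.get(mer, 0) + 1.0) / len(seq)
            entropy -= p * math.log(p)
        return entropy

    print(fivemer_entropy("A" * 2000) < 4)   # True: low complexity, filtered out
    print(fivemer_entropy("".join(random.choice("ACGT") for _ in range(2000))) >= 4)  # True
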
diff --git a/FALCON/src/py_scripts_v0.1/falcon_sense.py b/FALCON/src/py_scripts_v0.1/falcon_sense.py
deleted file mode 100644
index 26f1954..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_sense.py
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from ctypes import *
-import sys
-from multiprocessing import Pool
-import os
-import falcon_kit
-
-module_path = falcon_kit.__path__[0]
-
-falcon = CDLL(os.path.join(module_path, "falcon.so"))
-
-falcon.generate_consensus.argtypes = [ POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double ]
-falcon.generate_consensus.restype = POINTER(falcon_kit.ConsensusData)
-falcon.free_consensus_data.argtypes = [ POINTER(falcon_kit.ConsensusData) ]
-
-
-def get_alignment(seq1, seq0, edge_tolerance = 1000):
-
- kup = falcon_kit.kup
- K = 8
- lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
- sa_ptr = kup.allocate_seq( len(seq0) )
- sda_ptr = kup.allocate_seq_addr( len(seq0) )
- kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
- kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
- kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
- #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
- aln_range = aln_range_ptr[0]
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score
- e1 += K + K/2
- e0 += K + K/2
- kup.free_aln_range(aln_range)
- len_1 = len(seq1)
- len_0 = len(seq0)
- if e1 > len_1:
- e1 = len_1
- if e0 > len_0:
- e0 = len_0
-
- aln_size = 1
- if e1 - s1 > 500:
-
- aln_size = max( e1-s1, e0-s0 )
- aln_score = int(km_score * 48)
- aln_q_s = s1
- aln_q_e = e1
- aln_t_s = s0
- aln_t_e = e0
-
- kup.free_seq_addr_array(sda_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_kmer_lookup(lk_ptr)
-
- if s1 > edge_tolerance and s0 > edge_tolerance:
- return 0, 0, 0, 0, 0, 0, "none"
-
- if len_1 - e1 > edge_tolerance and len_0 - e0 > edge_tolerance:
- return 0, 0, 0, 0, 0, 0, "none"
-
-
- if e1 - s1 > 500 and aln_size > 500:
- return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
- else:
- return 0, 0, 0, 0, 0, 0, "none"
-
-def get_consensus_without_trim( c_input ):
- seqs, seed_id, config = c_input
- min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
- if len(seqs) > max_n_read:
- seqs = seqs[:max_n_read]
- seqs_ptr = (c_char_p * len(seqs))()
- seqs_ptr[:] = seqs
- consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(seqs), min_cov, K,
- local_match_count_window, local_match_count_threshold, min_idt )
-
- consensus = string_at(consensus_data_ptr[0].sequence)[:]
- eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
- falcon.free_consensus_data( consensus_data_ptr )
- del seqs_ptr
- return consensus, seed_id
-
-def get_consensus_with_trim( c_input ):
- seqs, seed_id, config = c_input
- min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
- trim_seqs = []
- seed = seqs[0]
- for seq in seqs[1:]:
- aln_data = get_alignment(seq, seed, edge_tolerance)
- s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
- if c_status == "none":
- continue
- if aln_score > 1000 and e1 - s1 > 500:
- e1 -= trim_size
- s1 += trim_size
- trim_seqs.append( (e1-s1, seq[s1:e1]) )
- trim_seqs.sort(key = lambda x:-x[0]) #use longest alignment first
- trim_seqs = [x[1] for x in trim_seqs]
-
- if len(trim_seqs) > max_n_read:
- trim_seqs = trim_seqs[:max_n_read]
-
- trim_seqs = [seed] + trim_seqs
-
-
- seqs_ptr = (c_char_p * len(trim_seqs))()
- seqs_ptr[:] = trim_seqs
- consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(trim_seqs), min_cov, K,
- local_match_count_window, local_match_count_threshold, min_idt )
- consensus = string_at(consensus_data_ptr[0].sequence)[:]
- eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
- falcon.free_consensus_data( consensus_data_ptr )
- del seqs_ptr
- return consensus, seed_id
-
-
-def get_seq_data(config):
- min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
- seqs = []
- seed_id = None
- seqs_data = []
- read_ids = set()
- with sys.stdin as f:
- for l in f:
- l = l.strip().split()
- if len(l) != 2:
- continue
- if l[0] not in ("+", "-"):
- if len(l[1]) > 100:
- if len(seqs) == 0:
- seqs.append(l[1]) #the "seed"
- seed_id = l[0]
- if l[0] not in read_ids: #avoiding using the same read twice
- seqs.append(l[1])
- elif l[0] == "+":
- if len(seqs) > 10:
- seqs.sort( key=lambda x: -len(x) )
- yield (seqs[:max_n_read], seed_id, config)
- #seqs_data.append( (seqs, seed_id) )
- seqs = []
- read_id = set()
- seed_id = None
- elif l[0] == "-":
- #yield (seqs, seed_id)
- #seqs_data.append( (seqs, seed_id) )
- break
-
-if __name__ == "__main__":
- import argparse
- import re
- parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
- parser.add_argument('--n_core', type=int, default=24,
- help='number of processes used for generating consensus')
- parser.add_argument('--local_match_count_window', type=int, default=12,
- help='local match window size')
- parser.add_argument('--local_match_count_threshold', type=int, default=6,
- help='local match count threshold')
- parser.add_argument('--min_cov', type=int, default=6,
- help='minimum coverage to break the consensus')
- parser.add_argument('--max_n_read', type=int, default=500,
- help='maximum number of reads used in generating the consensus')
- parser.add_argument('--trim', action="store_true", default=False,
- help='trim the input sequences with k-mer sparse dynamic programming to find the mapped range')
- parser.add_argument('--output_full', action="store_true", default=False,
- help='output uncorrected regions too')
- parser.add_argument('--output_multi', action="store_true", default=False,
- help='output multiple corrected regions')
- parser.add_argument('--min_idt', type=float, default=0.70,
- help='minimum identity of the alignments used for correction')
- parser.add_argument('--edge_tolerance', type=int, default=1000,
- help='for trimming, if the unaligned edge length is > edge_tolerance, ignore the read')
- parser.add_argument('--trim_size', type=int, default=50,
- help='the size for trimming both ends of the initial sparse aligned region')
- good_region = re.compile("[ACGT]+")
- args = parser.parse_args()
- exe_pool = Pool(args.n_core)
- if args.trim:
- get_consensus = get_consensus_with_trim
- else:
- get_consensus = get_consensus_without_trim
-
- K = 8
- config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
- args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
- for res in exe_pool.imap(get_consensus, get_seq_data(config)):
- cns, seed_id = res
- if args.output_full == True:
- if len(cns) > 500:
- print ">"+seed_id+"_f"
- print cns
- else:
- cns = good_region.findall(cns)
- if len(cns) == 0:
- continue
- if args.output_multi == True:
- seq_i = 0
- for cns_seq in cns:
- if len(cns_seq) > 500:
- print ">"+seed_id+"_%d" % seq_i
- print cns_seq
- seq_i += 1
- else:
- cns.sort(key = lambda x: len(x))
- if len(cns[-1]) > 500:
- print ">"+seed_id
- print cns[-1]
-
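falcon_sense.py, removed above, reads its alignment groups from stdin as blocks of "read_id sequence" lines: the first read in a block is the seed, a "+" line closes the block, and a "-" line ends the stream. A simplified sketch of that protocol (it omits the length and duplicate-read checks the removed script applied):

    # Sketch only: parse the block-delimited stream consumed by the removed
    # consensus script; yields (seed_id, [sequences]) per block.
    import sys

    def read_blocks(stream=sys.stdin):
        seqs, seed_id = [], None
        for line in stream:
            fields = line.strip().split()
            if len(fields) != 2:
                continue
            if fields[0] == "+":
                if seqs:
                    yield seed_id, seqs
                seqs, seed_id = [], None
            elif fields[0] == "-":
                break
            else:
                if not seqs:
                    seed_id = fields[0]   # first read in the block is the seed
                seqs.append(fields[1])
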
diff --git a/FALCON/src/py_scripts_v0.1/falcon_ucns_data.py b/FALCON/src/py_scripts_v0.1/falcon_ucns_data.py
deleted file mode 100644
index 7d206fd..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_ucns_data.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import sys
-import os
-
-
-rcmap = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
-
-if __name__ == "__main__":
- import argparse
- import re
- from pbcore.io import FastaReader
-
- tiling_path = {}
- with open("all_tiling_path_c") as f:
- for l in f:
- l = l.strip().split()
- tiling_path.setdefault( l[0], [])
-
- offset = int(l[1])
- node_id = l[2].split(":")
- s = int(l[3])
- e = int(l[4])
-
- tiling_path[ l[0] ].append( (offset, node_id[0], node_id[1], s, e) )
-
- f = FastaReader("preads.fa")
- seq_db = {}
- for r in f:
- seq_db[r.name] = r.sequence
-
- f = FastaReader("primary_tigs_c.fa")
- p_tigs_db = {}
- for r in f:
- p_tigs_db[r.name] = r.sequence
-
- for p_tig_id in p_tigs_db:
- pread_data = {}
- offsets = []
- seqs = []
- p_tig = p_tigs_db[p_tig_id]
- #if len(tiling_path[p_tig_id]) <= 2:
- # continue
- print p_tig_id, 0, p_tig
- for offset, s_id, end, s, e in tiling_path[p_tig_id]:
- seq = seq_db[s_id]
- if end == "B":
- s, e = e, s
- offset = offset - len(seq)
- seq = "".join([rcmap[c] for c in seq[::-1]])
- else:
- offset = offset - len(seq)
- print s_id, offset, seq
-
- print "+ + +"
-
- f = FastaReader("a_nodup.fa")
- a_tigs_db = {}
- for r in f:
- a_tigs_db[r.name] = r.sequence
-
- for a_tig_id in a_tigs_db:
- pread_data = {}
- offsets = []
- seqs = []
- a_tig = a_tigs_db[a_tig_id]
- #if len(tiling_path[a_tig_id]) <= 2:
- # continue
- print a_tig_id, 0, a_tig
- for offset, s_id, end, s, e in tiling_path[a_tig_id]:
- seq = seq_db[s_id]
- if end == "B":
- s, e = e, s
- offset = offset - len(seq)
- seq = "".join([rcmap[c] for c in seq[::-1]])
- else:
- offset = offset - len(seq)
- print s_id, offset, seq
-
- print "+ + +"
-
- print "- - -"
-
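falcon_ucns_data.py, removed above, lays out each contig's tiling-path reads for unitig consensus: a read whose edge ends in "B" lies on the reverse strand, so its start/end are swapped, its offset shifted back by the read length, and its sequence reverse-complemented before the "read_id offset sequence" line is printed. A small sketch of the offset and strand handling (the helper name is illustrative):

    # Sketch only: place one tiling-path read the way the removed script did
    # before emitting its layout line.
    RCMAP = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))

    def place_read(offset, end, seq):
        offset -= len(seq)
        if end == "B":                      # reverse-strand edge
            seq = "".join(RCMAP[c] for c in reversed(seq))
        return offset, seq
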
diff --git a/FALCON/src/py_scripts_v0.1/falcon_utgcns.py b/FALCON/src/py_scripts_v0.1/falcon_utgcns.py
deleted file mode 100644
index bd2cc1b..0000000
--- a/FALCON/src/py_scripts_v0.1/falcon_utgcns.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from ctypes import *
-import sys
-from multiprocessing import Pool
-import os
-import falcon_kit
-
-module_path = falcon_kit.__path__[0]
-
-falcon = CDLL(os.path.join(module_path, "falcon.so"))
-"""
-consensus_data * generate_utg_consensus( char ** input_seq,
- seq_coor_t *offset,
- unsigned int n_seq,
- unsigned min_cov,
- unsigned K,
- double min_idt) {
-"""
-falcon.generate_utg_consensus.argtypes = [ POINTER(c_char_p), POINTER(falcon_kit.seq_coor_t), c_uint, c_uint, c_uint, c_double ]
-falcon.generate_utg_consensus.restype = POINTER(falcon_kit.ConsensusData)
-falcon.free_consensus_data.argtypes = [ POINTER(falcon_kit.ConsensusData) ]
-
-rcmap = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
-
-def get_consensus(c_input):
- t_id, seqs, offsets, config = c_input
- K = config[0]
- seqs_ptr = (c_char_p * len(seqs))()
- seqs_ptr[:] = seqs
- offset_ptr = (c_long * len(seqs))( *offsets )
- consensus_data_ptr = falcon.generate_utg_consensus( seqs_ptr, offset_ptr, len(seqs), 0, K, 0.)
- consensus = string_at(consensus_data_ptr[0].sequence)[:]
- del seqs_ptr
- del offset_ptr
- falcon.free_consensus_data( consensus_data_ptr )
- return consensus, t_id
-
-def echo(c_input):
-
- t_id, seqs, offsets, config = c_input
-
- return len(seqs), "test"
-
-def get_seq_data(config):
- seqs = []
- offsets = []
- seed_id = None
- with sys.stdin as f:
- for l in f:
- l = l.strip().split()
- if len(l) != 3:
- continue
- if l[0] not in ("+", "-"):
- if len(seqs) == 0:
- seqs.append(l[2]) #the "seed"
- offsets.append( int(l[1]) )
- seed_id = l[0]
- else:
- seqs.append(l[2])
- offsets.append( int(l[1]) )
- elif l[0] == "+":
- yield (seed_id, seqs, offsets, config)
- seqs = []
- offsets = []
- seed_id = None
- elif l[0] == "-":
- break
-
-if __name__ == "__main__":
- import argparse
- import re
- parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
- parser.add_argument('--n_core', type=int, default=4,
- help='number of processes used for generating consensus')
- args = parser.parse_args()
- exe_pool = Pool(args.n_core)
- K = 8
- config = (K, )
- for res in exe_pool.imap(get_consensus, get_seq_data(config)):
- #for res in exe_pool.imap(echo, get_seq_data(config)):
- #for res in map(echo, get_seq_data(config)):
- #for res in map(get_consensus, get_seq_data(config)):
- cns, t_id = res
- print ">"+t_id+"|tigcns"
- print cns
-
diff --git a/FALCON/src/py_scripts_v0.1/get_ovl.sh b/FALCON/src/py_scripts_v0.1/get_ovl.sh
deleted file mode 100644
index 417f03b..0000000
--- a/FALCON/src/py_scripts_v0.1/get_ovl.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {} | python overlap_filter_step1.py > {}.ignore" ::: *.las
-rm all.ignore
-cat *.ignore > all.ignore
-/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {} | python overlap_filter_step2.py > {}.rc" ::: *.las
-cat *.rc > rc_out_all
-rm *.rc
-/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {} | python overlap_filter_step3.py > {}.ovl" ::: *.las
diff --git a/FALCON/src/py_scripts_v0.1/get_rdata.py b/FALCON/src/py_scripts_v0.1/get_rdata.py
deleted file mode 100755
index f4fbf99..0000000
--- a/FALCON/src/py_scripts_v0.1/get_rdata.py
+++ /dev/null
@@ -1,207 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import sys
-import glob
-#import pkg_resources
-import uuid
-from datetime import datetime
-
-from collections import Counter
-from multiprocessing import Pool
-#from pbtools.pbdagcon.q_sense import *
-import os
-
-"""
-try:
- __p4revision__ = "$Revision: #4 $"
- __p4change__ = "$Change: 121571 $"
- revNum = int(__p4revision__.strip("$").split(" ")[1].strip("#"))
- changeNum = int(__p4change__.strip("$").split(":")[-1])
- __version__ = "%s-r%d-c%d" % ( pkg_resources.require("pbtools.pbhgap")[0].version, revNum, changeNum )
-except:
- __version__ = "pbtools.hbar-dtk-github"
-"""
-
-query_fasta_fn = sys.argv[1]
-target_fasta_fn = sys.argv[2]
-m4_fofn = sys.argv[3]
-bestn = int(sys.argv[4])
-group_id = int(sys.argv[5])
-num_chunk = int(sys.argv[6])
-min_cov = int(sys.argv[7])
-max_cov = int(sys.argv[8])
-trim_align = int(sys.argv[9])
-trim_plr = int(sys.argv[10])
-
-
-rmap = dict(zip("ACGTNacgt-","TGCANntgca-"))
-def rc(seq):
- return "".join([rmap[c] for c in seq[::-1]])
-
-"""0x239fb832/0_590 0x722a1e26 -1843 81.6327 0 62 590 590 0 6417 6974 9822 254 11407 -74.5375 -67.9 1"""
-query_to_target = {}
-with open(m4_fofn) as fofn:
- for fn in fofn:
- fn = fn.strip()
- with open(fn) as m4_f:
- for l in m4_f:
- d = l.strip().split()
- id1, id2 = d[:2]
- #if -noSplitSubread not used, we will need the following line
- #id1 = id1.split("/")[0]
- if id1 == id2:
- continue
- if hash(id2) % num_chunk != group_id:
- continue
- if int(d[2]) > -1000: continue
- if int(d[11]) < 4000: continue
- query_to_target.setdefault(id1, [])
- query_to_target[id1].append( (int(d[2]), l) )
-
-target_to_query = {}
-for id1 in query_to_target:
- query_to_target[id1].sort()
- rank = 0
- for s, ll in query_to_target[id1][:bestn]:
- l = ll.strip()
- d = l.split()
- id1, id2 = d[:2]
- target_to_query.setdefault(id2,[])
- target_to_query[id2].append( ( (int(d[5])-int(d[6]), int(d[2])), l ) )
- #target_to_query[id2].append( ( int(d[2]), l ) )
- #rank += 1
-
-from pbcore.io import FastaIO
-query_data = {}
-with open(query_fasta_fn) as fofn:
- for fa_fn in fofn:
- fa_fn = fa_fn.strip()
- f_s = FastaIO.FastaReader(fa_fn)
- for s in f_s:
- id1 = s.name
- if id1 not in query_to_target:
- continue
- query_data[id1]=s.sequence
- f_s.file.close()
-
-target_data = {}
-with open(target_fasta_fn) as fofn:
- for fa_fn in fofn:
- fa_fn = fa_fn.strip()
- f_s = FastaIO.FastaReader(fa_fn)
- for s in f_s:
- id2 = s.name
- if hash(id2) % num_chunk != group_id:
- continue
- target_data[id2]=s.sequence
- f_s.file.close()
-
-
-ec_data = []
-base_count = Counter()
-r_count =0
-
-for id2 in target_to_query:
- if len(target_to_query[id2])<10:
- continue
- if id2 not in target_data:
- continue
-
- ref_data = (id2, target_data[id2])
- ref_len = len(target_data[id2])
- base_count.clear()
- base_count.update( target_data[id2] )
- if 1.0*base_count.most_common(1)[0][1]/ref_len > 0.8: # don't do preassembly if a read is >80% the same base
- continue
- read_data = []
-
- query_alignment = target_to_query[id2]
- query_alignment.sort() # get better alignment
- total_bases = 0
- max_cov_bases = max_cov * ref_len * 1.2
- #min_cov_bases = min_cov * ref_len * 3
-
- for rank_score, l in query_alignment:
- rank, score = rank_score
- #score = rank_score
- l = l.split()
- id1 = l[0]
- #if -noSplitSubread not used, we will need the following line
- #id1 = id1.split("/")[0]
- q_s = int(l[5]) + trim_align
- q_e = int(l[6]) - trim_align
- strand = int(l[8])
- t_s = int(l[9])
- t_e = int(l[10])
- t_l = int(l[11])
- #if strand == 1:
- # t_s, t_e = t_l - t_e, t_l - t_s
- # t_s += trim_align
- # t_e -= trim_align
-
- if q_e - q_s < 400:
- continue
- total_bases += q_e - q_s
- if total_bases > max_cov_bases:
- break
- q_seq = query_data[id1][q_s:q_e]
- read_data.append( ( "%s/0/%d_%d" % (id1, q_s, q_e), q_s, q_e, q_seq, strand, t_s, t_e) )
-
- if len(read_data) > 5:
- r_count += 1
- t_id, t_seq = ref_data
- t_len = len(t_seq)
- print t_id, t_seq
- for r in read_data:
- q_id, q_s, q_e, q_seq, strand, t_s, t_e = r
- if strand == 1:
- q_seq = rc(q_seq)
- print q_id, q_seq
- #if r_count > 600:
- # break
- print "+ +"
-print "- -"
-
-#output_dir,dumb = os.path.split( os.path.abspath( output_file ) )
-#output_log = open ( os.path.join( output_dir, "j%02d.log" % group_id ), "w" )
-
-
-
-
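get_rdata.py, removed above, selects reads for pre-assembly from M4 overlap records: per query it keeps the bestn highest-scoring hits (more negative scores are better in this format), then regroups the kept records by target and caps total coverage. A hedged sketch of the best-n selection step (the record layout, names, and the default bestn value are simplifications, not the removed code):

    # Sketch only: best-n selection per query, then regroup by target, as in
    # the removed pre-assembly data collector.
    import collections

    def best_n_by_query(m4_records, bestn=12):
        # m4_records: iterable of (query_id, target_id, score); lower (more
        # negative) scores mean better alignments.
        by_query = collections.defaultdict(list)
        for q_id, t_id, score in m4_records:
            if q_id != t_id:
                by_query[q_id].append((score, t_id))
        by_target = collections.defaultdict(list)
        for q_id, hits in by_query.items():
            for score, t_id in sorted(hits)[:bestn]:
                by_target[t_id].append((q_id, score))
        return by_target
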
diff --git a/FALCON/src/py_scripts_v0.1/overlapper.py b/FALCON/src/py_scripts_v0.1/overlapper.py
deleted file mode 100644
index 3a040d2..0000000
--- a/FALCON/src/py_scripts_v0.1/overlapper.py
+++ /dev/null
@@ -1,216 +0,0 @@
-from falcon_kit import kup, falcon, DWA, get_consensus, get_alignment
-from pbcore.io import FastaReader
-import numpy as np
-import collections
-import sys
-
-seqs = []
-q_seqs = {}
- f = FastaReader(sys.argv[1]) # take one command line argument of the input fasta file name
-
-for r in f:
- if len(r.sequence) < 6000:
- continue
- seq = r.sequence.upper()
- seqs.append( (r.name, seq[:500], seq[-500:] ) )
- q_seqs[r.name] = seq
-
-
-total_index_base = len(seqs) * 1000
-print total_index_base
-sa_ptr = kup.allocate_seq( total_index_base )
-sda_ptr = kup.allocate_seq_addr( total_index_base )
-K=14
-lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
-
-start = 0
-for r_name, prefix, suffix in seqs:
- kup.add_sequence( start, K, prefix, 500, sda_ptr, sa_ptr, lk_ptr)
- start += 500
- kup.add_sequence( start, K, suffix, 500, sda_ptr, sa_ptr, lk_ptr)
- start += 500
-#kup.mask_k_mer(1 << (K * 2), lk_ptr, 256)
-
-kup.mask_k_mer(1 << (K * 2), lk_ptr, 64)
-
-def get_alignment(seq1, seq0):
-
- K = 8
- lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
- sa_ptr = kup.allocate_seq( len(seq0) )
- sda_ptr = kup.allocate_seq_addr( len(seq0) )
- kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
- #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
- if e1 - s1 > 500:
- #s1 = 0 if s1 < 14 else s1 - 14
- #s2 = 0 if s2 < 14 else s2 - 14
- e1 = len(seq1) if e1 >= len(seq1)-2*K else e1 + K*2
- e2 = len(seq0) if e2 >= len(seq0)-2*K else e2 + K*2
-
- alignment = DWA.align(seq1[s1:e1], e1-s1,
- seq0[s2:e2], e2-s2,
- 100, 0)
- #print seq1[s1:e1]
- #print seq0[s2:e2]
- #if alignment[0].aln_str_size > 500:
-
- #aln_str1 = alignment[0].q_aln_str
- #aln_str0 = alignment[0].t_aln_str
- aln_size = alignment[0].aln_str_size
- aln_dist = alignment[0].dist
- aln_q_s = alignment[0].aln_q_s
- aln_q_e = alignment[0].aln_q_e
- aln_t_s = alignment[0].aln_t_s
- aln_t_e = alignment[0].aln_t_e
- assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
- #print aln_str1
- #print aln_str0
-
- DWA.free_alignment(alignment)
-
- kup.free_seq_addr_array(sda_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_kmer_lookup(lk_ptr)
- if e1 - s1 > 500 and aln_size > 500:
- return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist
- else:
- return None
-
-
-def get_ovelap_alignment(seq1, seq0):
-
- K = 8
- lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
- sa_ptr = kup.allocate_seq( len(seq0) )
- sda_ptr = kup.allocate_seq_addr( len(seq0) )
- kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
- #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
- len_1 = len(seq1)
- len_0 = len(seq0)
- do_aln = False
- contain_status = "none"
- if e1 - s1 > 500:
- if s1 < 100 and len_1 - e1 < 100:
- do_aln = False
- contain_status = "contains"
- elif s0 < 100 and len_0 - e0 < 100:
- do_aln = False
- contain_status = "contained"
- else:
- do_aln = True
- if s0 < s1:
- s1 -= s0 #assert s1 > 0
- s0 = 0
- e1 = len_1
- e0 = len_1 - s1 if len_1 - s1 < len_0 else len_0
- if e0 == len_0:
- do_aln = False
- contain_status = "contained"
-
- if s1 <= s0:
- s0 -= s1 #assert s1 > 0
- s1 = 0
- e0 = len_0
- e1 = len_0 - s0 if len_0 - s0 < len_1 else len_1
- if e1 == len_1:
- do_aln = False
- contain_status = "contains"
-
-
- if do_aln:
- alignment = DWA.align(seq1[s1:e1], e1-s1,
- seq0[s0:e0], e0-s0,
- 500, 0)
- #print seq1[s1:e1]
- #print seq0[s2:e2]
- #if alignment[0].aln_str_size > 500:
-
- #aln_str1 = alignment[0].q_aln_str
- #aln_str0 = alignment[0].t_aln_str
- aln_size = alignment[0].aln_str_size
- aln_dist = alignment[0].dist
- aln_q_s = alignment[0].aln_q_s
- aln_q_e = alignment[0].aln_q_e
- aln_t_s = alignment[0].aln_t_s
- aln_t_e = alignment[0].aln_t_e
- assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
- #print aln_str1
- #print aln_str0
- if aln_size > 500:
- contain_status = "overlap"
- DWA.free_alignment(alignment)
-
- kup.free_seq_addr_array(sda_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_kmer_lookup(lk_ptr)
-
- if e1 - s1 > 500 and do_aln and aln_size > 500:
- #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
- return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
- else:
- return 0, 0, 0, 0, 0, 0, contain_status
-
-rc_map = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
-with open("test_ovlp.dat","w") as f:
- for name, q_seq in q_seqs.items():
- kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index = np.array(kmer_match.target_pos[0:count])/500
- kup.free_kmer_match(kmer_match_ptr)
-
- c = collections.Counter(hit_index)
- s = [c[0] for c in c.items() if c[1] >50]
- #s.sort()
- targets = set()
- for p in s:
- hit_id = seqs[p/2][0]
- if hit_id in targets or hit_id == name:
- continue
- targets.add(hit_id)
- seq1, seq0 = q_seq, q_seqs[hit_id ]
- rtn = get_ovelap_alignment(seq1, seq0)
- #rtn = get_alignment(seq1, seq0)
- if rtn != None:
-
- s1, e1, s2, e2, aln_size, aln_dist, c_status = rtn
- #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0), aln_size, aln_dist
- print >>f, hit_id, name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 0, s2, e2, len(seq0), 0, s1, e1, len(seq1), c_status
-
- r_q_seq = "".join([rc_map[c] for c in q_seq[::-1]])
-
- kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- count = kmer_match.count
- hit_index = np.array(kmer_match.target_pos[0:count])/500
- kup.free_kmer_match(kmer_match_ptr)
-
- c = collections.Counter(hit_index)
- s = [c[0] for c in c.items() if c[1] >50]
- #s.sort()
- targets = set()
- for p in s:
- hit_id = seqs[p/2][0]
- if hit_id in targets or hit_id == name:
- continue
- targets.add(hit_id)
- seq1, seq0 = r_q_seq, q_seqs[hit_id]
- rtn = get_ovelap_alignment(seq1, seq0)
- #rtn = get_alignment(seq1, seq0)
- if rtn != None:
- s1, e1, s2, e2, aln_size, aln_dist, c_status = rtn
- #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0), aln_size, aln_dist
- print >>f, hit_id, name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 0, s2, e2, len(seq0), 1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status
-
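overlapper.py, removed above, classifies each candidate pair from the sparse k-mer alignment range before deciding whether to run the full dynamic-programming alignment: if the range reaches within 100 bp of both ends of one read, the pair is a containment; otherwise it is aligned and may be reported as a dovetail overlap. A simplified sketch of that classification (it skips the coordinate extension and the alignment itself; labels follow the removed get_ovelap_alignment()):

    # Sketch only: containment/overlap classification following the branch
    # structure of the removed overlapper.
    def classify_overlap(s1, e1, len1, s0, e0, len0, end_slack=100, min_span=500):
        if e1 - s1 <= min_span:
            return "none"
        if s1 < end_slack and len1 - e1 < end_slack:
            return "contains"       # query (seq1) aligned nearly end to end
        if s0 < end_slack and len0 - e0 < end_slack:
            return "contained"      # target (seq0) aligned nearly end to end
        return "overlap"            # dovetail candidate; the removed code aligned it
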
diff --git a/FALCON/src/py_scripts_v0.1/ovlp_filter.sh b/FALCON/src/py_scripts_v0.1/ovlp_filter.sh
deleted file mode 100644
index 608389e..0000000
--- a/FALCON/src/py_scripts_v0.1/ovlp_filter.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-source /mnt/secondary/Share/HBAR_03202013/bin/activate
-parallel -j 24 "LA4Falcon -mo -H10000 {} | python overlap_filter_step1.py > {}.ignore" ::: *.las
-cat *.ignore > all.ignore
-parallel -j 24 "LA4Falcon -mo -H10000 {} | python overlap_filter_step2.py > {}.rc" ::: *.las
-cat *.rc > rc_out_all
-parallel -j 24 "LA4Falcon -mo -H10000 {} | python overlap_filter_step3.py > {}.ovl" ::: *.las
diff --git a/FALCON/src/py_scripts_v0.1/redis_graph.py b/FALCON/src/py_scripts_v0.1/redis_graph.py
deleted file mode 100644
index 555c090..0000000
--- a/FALCON/src/py_scripts_v0.1/redis_graph.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import redis
-import sys
-from pbcore.io import FastaReader
-
-
-r = redis.StrictRedis(host='localhost', port=6379, db=0)
-
-class RedisList(object):
-
- def __init__(self, rs):
- self._rs = rs
- self.id_ = "pid:" + str( id(self) )
-
- def append(self, value):
- self._rs.rpush( self.id_, value)
-
- def __len__(self):
- return self._rs.llen( self.id_ )
-
- def __getitem__(self, i):
- return self._rs.lrange( self.id_, i, i)
-
- def pylist(self):
- return self._rs.lrange( self.id_, 0, -1)
-
- def __del__(self):
- self._rs.delete(self.id_)
-
-class RedisDict(object):
-
- def __init__(self, rs):
- self._rs = rs
- self.id_ = "pid:" + str( id(self) )
-
- def __setitem__(self, key, value):
- self._rs.hset( self.id_, key, value )
-
- def __getitem__(self, key):
- return self._rs.hget( self.id_, key )
-
- def __delitem__(self, key):
- return self._rs.hdel( self.id_, key)
-
-
- def __len__(self):
- return self._rs.hlen( self.id_ )
-
- def keys(self):
- return self._rs.hgetall( self.id_ ).keys()
-
- def values(self):
- return self._rs.hgetall( self.id_ ).values()
-
- def pydict(self):
- return self._rs.hgetall( self.id_ )
-
- def __del__(self):
- self._rs.delete(self.id_)
-
-def test_list():
- x = RedisList(r)
- x.append( "1" )
- x.append( "2" )
- print len(x)
- print x.pylist()
- del x
-
- y = RedisDict(r)
- y["a"] = "b"
- y["b"] = 1
- print y["a"]
- del y["a"]
- print y.values()
- print y.keys()
- print y.pydict()
- del y
-
-if __name__ == "__main__":
- test_list()
diff --git a/FALCON/src/py_scripts_v0.1/remove_dup_ctg.py b/FALCON/src/py_scripts_v0.1/remove_dup_ctg.py
deleted file mode 100755
index 3164eb6..0000000
--- a/FALCON/src/py_scripts_v0.1/remove_dup_ctg.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-import pbcore.io
-
-import sys
-"""nucmer -maxmatch all_tigs.fa all_tigs.fa -p all_tigs_self >& /dev/null"""
-"""show-coords -o -H -T all_tigs_self.delta | grep CONTAINS | awk '$7>96' | awk '{print $9}' | sort -u > all_tigs_duplicated_ids"""
-
-id_to_remove = set()
-with open("all_tigs_duplicated_ids") as f:
- for l in f:
- l = l.strip().split("-")
- major, minor = l[:2]
- id_to_remove.add ( (major, minor) )
-
-f = pbcore.io.FastaReader("all_tigs.fa")
-with open("a-tigs_nodup.fa", "w") as f_out:
- for r in f:
- major, minor = r.name.split()[0].split("-")[:2]
- if minor == "0000":
- continue
- if (major, minor) in id_to_remove:
- continue
- if len(r.sequence) < 500:
- continue
- print >>f_out, ">"+r.name
- print >>f_out, r.sequence
-
-f = pbcore.io.FastaReader("primary_tigs_c.fa")
-with open("p-tigs_nodup.fa", "w") as f_out:
- for r in f:
- major, minor = r.name.split()[0].split("_")[:2]
- if (major, "0000") in id_to_remove:
- continue
- if len(r.sequence) < 500:
- continue
- print >>f_out, ">"+r.name
- print >>f_out, r.sequence
diff --git a/FALCON/test/helpers.py b/FALCON/test/helpers.py
index 898f7db..a6820b4 100644
--- a/FALCON/test/helpers.py
+++ b/FALCON/test/helpers.py
@@ -1,4 +1,5 @@
from nose.tools import assert_equal, assert_raises, eq_
+import os.path
def equal_list(a, b):
eq_(set(a) ^ set(b), set())
@@ -12,3 +13,6 @@ def equal_multiline(a, b):
alines = a.splitlines()
blines = b.splitlines()
equal_list(alines, blines)
+
+def get_test_data_dir():
+ return os.path.join(os.path.dirname(__file__), '..', 'test_data')
diff --git a/FALCON/test/test_calc_cutoff.py b/FALCON/test/test_calc_cutoff.py
new file mode 100644
index 0000000..4ae0d8a
--- /dev/null
+++ b/FALCON/test/test_calc_cutoff.py
@@ -0,0 +1,43 @@
+import falcon_kit.mains.calc_cutoff as mod
+import helpers
+import os.path
+import pytest
+
+def test_help():
+ try:
+ mod.main(['prog', '--help'])
+ except SystemExit:
+ pass
+
+# Note: genome_size==1 makes math easy.
+
+def test_calc_cutoff(capsys):
+ partial_capture_fn = os.path.join(helpers.get_test_data_dir(), 'calc_cutoff/partial_capture.txt')
+ assert os.path.exists(partial_capture_fn)
+ mod.main('prog --coverage 14 1 {}'.format(partial_capture_fn).split())
+ out, err = capsys.readouterr()
+ assert out == '2'
+ assert not err
+
+expected_err = """
+GenomeCoverageError: Not enough reads available for desired genome coverage (bases needed=23 > actual=22)
+User-provided genome_size: 1
+Desired coverage: 23.0
+"""
+
+def test_calc_cutoff_err():
+ partial_capture_fn = os.path.join(helpers.get_test_data_dir(), 'calc_cutoff/partial_capture.txt')
+ assert os.path.exists(partial_capture_fn)
+ with pytest.raises(Exception) as excinfo:
+ mod.main('prog --coverage 23 1 {}'.format(partial_capture_fn).split())
+ assert expected_err in str(excinfo.value)
+
+def test_calc_cutoff_errfile(monkeypatch, tmpdir):
+ fn = str(tmpdir.mkdir('tmp').join('errfile'))
+ monkeypatch.setenv('PBFALCON_ERRFILE', fn)
+ partial_capture_fn = os.path.join(helpers.get_test_data_dir(), 'calc_cutoff/partial_capture.txt')
+ assert os.path.exists(partial_capture_fn)
+ with pytest.raises(Exception) as excinfo:
+ mod.main('prog --coverage 23 1 {}'.format(partial_capture_fn).split())
+ assert expected_err in str(excinfo.value)
+ assert expected_err in open(fn).read()
diff --git a/FALCON/test/test_functional.py b/FALCON/test/test_functional.py
index bbcacaf..81c5ff3 100644
--- a/FALCON/test/test_functional.py
+++ b/FALCON/test/test_functional.py
@@ -1,5 +1,6 @@
import helpers
from nose.tools import assert_equal, assert_raises, eq_
+import pytest
import falcon_kit.functional as f
import StringIO
import collections
@@ -18,6 +19,12 @@ def test_get_daligner_job_descriptions():
helpers.equal_multiline(result[('.2', '.1', '.2')], "daligner -v -h1 -t16 -H1 -e0.7 -l1 -s1000 raw_reads.2 raw_reads.1 raw_reads.2\nLAcheck -v raw_reads *.las\nLAsort -v raw_reads.1.raw_reads.2.C0 raw_reads.1.raw_reads.2.N0 && LAmerge -v L1.1.2 raw_reads.1.raw_reads.2.C0.S raw_reads.1.raw_reads.2.N0.S && rm raw_reads.1.raw_reads.2.C0.S.las raw_reads.1.raw_reads.2.N0.S.las\nLAsort -v raw_reads.2.raw_reads.1.C0 raw_reads.2.raw_reads.1.N0 && LAmerge -v L1.2.1 raw_reads.2.raw_reads.1.C0. [...]
eq_(len(result), 2)
+def test_get_daligner_job_descriptions_with_bad_arg():
+ with pytest.raises(AssertionError) as excinfo:
+ f.get_daligner_job_descriptions(
+ 'fake_filename.txt', 'raw_reads')
+ assert "['f', 'a', 'k', 'e'" in str(excinfo.value)
+
def test_get_daligner_job_descriptions_small():
# when there is only 1 block, a special case
example_HPCdaligner = open(example_HPCdaligner_small_fn)
@@ -153,6 +160,13 @@ def test_calc_cutoff():
got = f.calc_cutoff(target, partial_capture)
eq_(expected, got)
+def test_calc_cutoff_bad_coverage():
+ target = 23 # > 22 available
+ expected_message = 'Not enough reads available for desired genome coverage (bases needed=23 > actual=22)'
+ with assert_raises(f.GenomeCoverageError) as ctx:
+ f.calc_cutoff(target, partial_capture)
+ eq_(expected_message, ctx.exception.message)
+
sample_DBdump_output = """+ R 2
+ M 0
+ H 400
diff --git a/FALCON/test/test_stats_preassembly.py b/FALCON/test/test_stats_preassembly.py
index 628e479..26a3b9b 100644
--- a/FALCON/test/test_stats_preassembly.py
+++ b/FALCON/test/test_stats_preassembly.py
@@ -2,11 +2,16 @@ import falcon_kit.stats_preassembly as M
import helpers
from cStringIO import StringIO
+def test_stats_from_sorted_readlengths():
+ stats = M.stats_from_sorted_readlengths([1,2,3,4])
+ expected = M.Stats(nreads=4, total=10, n50=3, p95=4, esize=3.0)
+ helpers.assert_equal(stats, expected)
+
def test_stats_dict():
#Stats = collections.namedtuple('FastaStats', ['nreads', 'total', 'n50', 'p95'])
- stats_raw_reads = M.Stats(100, 1000, 50, 95)
- stats_seed_reads = M.Stats(50, 500, 25, 40)
- stats_corrected_reads = M.Stats(10, 100, 5, 9)
+ stats_raw_reads = M.Stats(100, 1000, 50, 95, 0.0)
+ stats_seed_reads = M.Stats(50, 500, 25, 40, 0.0)
+ stats_corrected_reads = M.Stats(10, 100, 5, 9, 0.0)
genome_length = 19
length_cutoff = 10
frag = 1.0
@@ -25,16 +30,20 @@ def test_stats_dict():
'preassembled_seed_fragmentation': 1.0,
'preassembled_seed_truncation': 2.5,
'preassembled_yield': 0.2,
+ 'preassembled_esize': 0.0,
'raw_bases': 1000,
'raw_coverage': 52.632,
'raw_mean': 10.0,
'raw_n50': 50,
'raw_p95': 95,
'raw_reads': 100,
+ 'raw_esize': 0.0,
'seed_bases': 500,
'seed_coverage': 26.316,
'seed_mean': 10.0,
'seed_n50': 25,
'seed_p95': 40,
- 'seed_reads': 50}
+ 'seed_reads': 50,
+ 'seed_esize': 0.0,
+ }
helpers.equal_dict(result, expected)
diff --git a/FALCON/test_data/calc_cutoff/partial_capture.txt b/FALCON/test_data/calc_cutoff/partial_capture.txt
new file mode 100644
index 0000000..4b49d9c
--- /dev/null
+++ b/FALCON/test_data/calc_cutoff/partial_capture.txt
@@ -0,0 +1,5 @@
+ Bin: Count % Reads % Bases Average
+ 4: 2 0.0 0.0 xxx
+ 3: 0 0.0 0.0 xxx
+ 2: 3 0.0 0.0 xxx
+ 1: 8 0.0 0.0 xxx
diff --git a/FALCON/travis.sh b/FALCON/travis.sh
index 27dcfb9..fd74da7 100755
--- a/FALCON/travis.sh
+++ b/FALCON/travis.sh
@@ -5,16 +5,9 @@
set -vex
#env | sort
-mkdir -p fc-env
-rm -f fc-env/bin/python
-virtualenv -p python2.7 fc-env || ../virtualenv/virtualenv.py fc-env
-. fc-env/bin/activate
-python setup.py -v install
-python -c 'import falcon_kit; print falcon_kit.falcon'
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+export PATH=$PYTHONUSERBASE/bin:$PATH
-# When doctests are passing, add this:
-pip install nose
-nosetests -v test/
-nosetests -v --with-doctest falcon_kit/functional.py
-# We cannot run that on *all* modules because some include dependencies.
-# Just pypeFLOW for now, but I would rather not test dependencies.
+make install
+make test
diff --git a/bamboo_build_and_test.sh b/bamboo_build_and_test.sh
new file mode 100755
index 0000000..5447bce
--- /dev/null
+++ b/bamboo_build_and_test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#rm -rf FALCON-integrate
+
+#git clone https://github.com/PacificBiosciences/FALCON-integrate
+#cd FALCON-integrate
+pwd
+ls -l
+git submodule
+git --version
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+module unload git gcc ccache
+module load git/2.8.3
+module load gcc/4.9.2
+module load ccache/3.2.3
+#module load make
+
+set -vex
+git --version
+which gcc
+which g++
+gcc --version
+# We cannot use /bin/python without /bin/gcc.
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+export CCACHE_DIR=/mnt/secondary/Share/tmp/bamboo.mobs.ccachedir
+
+git remote -v
+ls -larth
+pwd
+#git submodule update --init #No! We must use BB for some,
+# and rel URLs do not work for file://nothing
+env | sort
+MY_BRANCH=${bamboo_planRepository_branch}
+#git submodule foreach git pull origin ${MY_BRANCH}
+git submodule foreach git checkout ${MY_BRANCH}
+#git submodule update --init git-sym FALCON-make FALCON-examples
+git submodule
+./travis.sh
+ls -l $HOME/.ccache
+cat $HOME/.ccache/ccache.conf
+date --utc
diff --git a/makefile b/makefile
index d80b0a2..df20994 100644
--- a/makefile
+++ b/makefile
@@ -6,7 +6,6 @@ default:
@echo 'make config-???'
@echo 'make all'
init:
- git submodule update --init
cp -f default-env.sh env.sh
config-edit:
bash ./FALCON-make/config-edit.sh
@@ -22,4 +21,9 @@ install:
test:
${MAKE} -C ./FALCON-make/ $@
+update: # for creating new releases
+ git submodule update --remote
+ git add .
+ log-compares # my own tool
+
.PHONY: init test
diff --git a/pypeFLOW/bamboo_build.sh b/pypeFLOW/bamboo_build.sh
new file mode 100644
index 0000000..fdea7dd
--- /dev/null
+++ b/pypeFLOW/bamboo_build.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+#type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+#module unload git gcc ccache
+#module load git/2.8.3
+#module load gcc/4.9.2
+#module load ccache/3.2.3
+##module load make
+
+set -vx
+#git --version
+#which gcc
+#which g++
+#gcc --version
+## We cannot use /bin/python without /bin/gcc.
+export PATH=/mnt/software/a/anaconda2/4.2.0/bin:$PATH
+which python
+
+mkdir -p LOCAL
+export PYTHONUSERBASE=$(pwd)/LOCAL
+
+#pip -v install --upgrade --user pip
+pip -v install --user .
+
+make pylint
+
+#python setup.py bdist_wheel
+
+nosetests -v --with-xunit --xunit-file=nose.doctest.xml --with-doctest pypeflow/ pwatcher/fs_based.py
diff --git a/pypeFLOW/makefile b/pypeFLOW/makefile
new file mode 100644
index 0000000..583be07
--- /dev/null
+++ b/pypeFLOW/makefile
@@ -0,0 +1,3 @@
+default:
+pylint:
+ pylint --errors-only pypeflow/ pwatcher/
diff --git a/pypeFLOW/pwatcher/blocking.py b/pypeFLOW/pwatcher/blocking.py
index 57a53c8..ae3e380 100755
--- a/pypeFLOW/pwatcher/blocking.py
+++ b/pypeFLOW/pwatcher/blocking.py
@@ -97,8 +97,8 @@ class State(object):
jobid2status[jobid] = status
def get_running_jobids(self):
return list(self.jobids_submitted)
- def serialize(state):
- return pprint.pformat(state.top)
+ def serialize(self):
+ return pprint.pformat(self.top)
@staticmethod
def deserialize(directory, content):
state = State(directory)
diff --git a/pypeFLOW/pwatcher/fs_based.py b/pypeFLOW/pwatcher/fs_based.py
index 1a203a8..0277d9a 100755
--- a/pypeFLOW/pwatcher/fs_based.py
+++ b/pypeFLOW/pwatcher/fs_based.py
@@ -128,8 +128,8 @@ class State(object):
return {jobid: bjob.mjob for jobid, bjob in self.top['jobs'].iteritems()}
def add_deleted_jobid(self, jobid):
self.top['jobids_deleted'].append(jobid)
- def serialize(state):
- return pprint.pformat(state.top)
+ def serialize(self):
+ return pprint.pformat(self.top)
@staticmethod
def deserialize(directory, content):
state = State(directory)
@@ -235,16 +235,18 @@ def background(script, exe='/bin/bash'):
#system(checkcall, checked=True)
return pid
-def qstripped(option):
+def qstripped(option, flag='-q'):
"""Given a string of options, remove any -q foo.
>>> qstripped('-xy -q foo -z bar')
'-xy -z bar'
+ >>> qstripped('-xy -p foo -z bar', '-p')
+ '-xy -z bar'
"""
# For now, do not strip -qfoo
vals = option.strip().split()
- while '-q' in vals:
- i = vals.index('-q')
+ while flag in vals:
+ i = vals.index(flag)
vals = vals[0:i] + vals[i+2:]
return ' '.join(vals)
@@ -285,10 +287,10 @@ class MetaJobSge(object):
specific = self.specific
#cwd = os.getcwd()
job_name = self.get_jobname()
- sge_option = qstripped(self.mjob.job.options['sge_option'])
- if '-q' not in sge_option:
- job_queue = self.mjob.job.options['job_queue']
- sge_option = '-q {} '.format(job_queue) + sge_option
+ sge_option = self.mjob.job.options['sge_option']
+ job_queue = self.mjob.job.options['job_queue']
+ if job_queue:
+ sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
# Add shebang, in case shell_start_mode=unix_behavior.
# https://github.com/PacificBiosciences/FALCON/pull/348
with open(script_fn, 'r') as original: data = original.read()
@@ -331,10 +333,10 @@ usage: qsub [-a date_time] [-A account_string] [-c interval]
specific = self.specific
#cwd = os.getcwd()
job_name = self.get_jobname()
- sge_option = qstripped(self.mjob.job.options['sge_option'])
- if '-q' not in sge_option:
- job_queue = self.mjob.job.options['job_queue']
- sge_option = '-q {} '.format(job_queue) + sge_option
+ sge_option = self.mjob.job.options['sge_option']
+ job_queue = self.mjob.job.options['job_queue']
+ if job_queue:
+ sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
# Add shebang, in case shell_start_mode=unix_behavior.
# https://github.com/PacificBiosciences/FALCON/pull/348
with open(script_fn, 'r') as original: data = original.read()
@@ -370,10 +372,10 @@ class MetaJobTorque(object):
specific = self.specific
#cwd = os.getcwd()
job_name = self.get_jobname()
- sge_option = qstripped(self.mjob.job.options['sge_option'])
- if '-q' not in sge_option:
- job_queue = self.mjob.job.options['job_queue']
- sge_option = '-q {} '.format(job_queue) + sge_option
+ sge_option = self.mjob.job.options['sge_option']
+ job_queue = self.mjob.job.options['job_queue']
+ if job_queue:
+ sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
cwd = os.getcwd()
# Add shebang, in case shell_start_mode=unix_behavior.
# https://github.com/PacificBiosciences/FALCON/pull/348
@@ -407,10 +409,10 @@ class MetaJobSlurm(object):
"""Can raise.
"""
job_name = self.get_jobname()
- sge_option = qstripped(self.mjob.job.options['sge_option'])
- if '-p' not in sge_option:
- job_queue = self.mjob.job.options['job_queue']
- sge_option = '-p {} '.format(job_queue) + sge_option
+ sge_option = self.mjob.job.options['sge_option']
+ job_queue = self.mjob.job.options['job_queue']
+ if job_queue:
+ sge_option = '-p {} '.format(job_queue) + qstripped(sge_option, '-p')
cwd = os.getcwd()
sge_cmd = 'sbatch -J {job_name} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
**locals())
@@ -441,10 +443,10 @@ class MetaJobLsf(object):
"""Can raise.
"""
job_name = self.get_jobname()
- sge_option = qstripped(self.mjob.job.options['sge_option'])
- if '-q' not in sge_option:
- job_queue = self.mjob.job.options['job_queue']
- sge_option = '-q {} '.format(job_queue) + sge_option
+ sge_option = self.mjob.job.options['sge_option']
+ job_queue = self.mjob.job.options['job_queue']
+ if job_queue:
+ sge_option = '-q {} '.format(job_queue) + qstripped(sge_option)
sge_cmd = 'bsub -J {job_name} {sge_option} -o stdout -e stderr "{exe} {script_fn}"'.format(
**locals())
# "Sets the user's execution environment for the job, including the current working directory, file creation mask, and all environment variables, and sets LSF environment variables before starting the job."
@@ -657,7 +659,7 @@ def delete_heartbeat(state, heartbeat, keep=False):
try:
bjob = state.get_bjob(jobid)
except Exception:
- log.exception('In delete_heartbeat(), unable to find batchjob for % (from %s)' %(jobid, heartbeat))
+ log.exception('In delete_heartbeat(), unable to find batchjob for %s (from %s)' %(jobid, heartbeat))
log.warning('Cannot delete. You might be able to delete this yourself if you examine the content of %s.' %heartbeat_fn)
# TODO: Maybe provide a default grid type, so we can attempt to delete anyway?
return
diff --git a/pypeFLOW/pwatcher/mains/pypeflow_example.py b/pypeFLOW/pwatcher/mains/pypeflow_example.py
index b71cc03..aee0d2b 100644
--- a/pypeFLOW/pwatcher/mains/pypeflow_example.py
+++ b/pypeFLOW/pwatcher/mains/pypeflow_example.py
@@ -1,6 +1,5 @@
-from pypeflow.pwatcher_bridge import PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase
-from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
-from pypeflow.task import PypeTask
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+ makePypeLocalFile, fn, PypeTask)
import json
import logging.config
import os
@@ -102,10 +101,9 @@ def main():
JOB_TYPE, SLEEP_S))
exitOnFailure=False
concurrent_jobs=2
- #Workflow = pypeflow.controller.PypeThreadWorkflow
Workflow = PypeProcWatcherWorkflow
- Workflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
wf = Workflow(job_type=JOB_TYPE)
+ wf.max_jobs = concurrent_jobs
par = dict(sleep_s=SLEEP_S)
DIR ='mytmp'
@@ -113,17 +111,17 @@ def main():
f0 = makePypeLocalFile('mytmp/f0')
f1 = makePypeLocalFile('mytmp/f1')
make_task = PypeTask(
- #inputs = {'f': f},
+ inputs = {},
outputs = {'f0': f0},
parameters = par,
- TaskType = MyFakePypeThreadTaskBase)
+ )
task = make_task(taskrun0)
wf.addTasks([task])
make_task = PypeTask(
inputs = {'f0': f0},
outputs = {'f1': f1},
parameters = par,
- TaskType = MyFakePypeThreadTaskBase)
+ )
task = make_task(taskrun1)
wf.addTasks([task])
wf.refreshTargets([task])
diff --git a/pypeFLOW/pwatcher/mains/query_server.py b/pypeFLOW/pwatcher/mains/query_server.py
index 94604d5..d642730 100755
--- a/pypeFLOW/pwatcher/mains/query_server.py
+++ b/pypeFLOW/pwatcher/mains/query_server.py
@@ -65,8 +65,7 @@ def find_server(args):
if args.sf:
i += 1
if i > 1:
- print('Error: may only specify server once')
- parser.print_usage()
+ raise Exception('Error: may only specify server once. Try "--help".')
return
if args.sf:
if os.path.exists(args.sf):
diff --git a/pypeFLOW/pwatcher/network_based.py b/pypeFLOW/pwatcher/network_based.py
index 803431e..e16b185 100755
--- a/pypeFLOW/pwatcher/network_based.py
+++ b/pypeFLOW/pwatcher/network_based.py
@@ -241,7 +241,7 @@ def start_server(server_directories, hostname='', port=0):
# set daemon to make sure server shuts down when main program finishes
hb_thread.daemon = True
hb_thread.start()
- log.debug('server ({}, {}) alive?'.format(hostname, port, hb_thread.is_alive()))
+ log.debug('server ({}, {}) alive? {}'.format(hostname, port, hb_thread.is_alive()))
return (hb_thread.authkey, (hostname, port))
class MetaJobClass(object):
@@ -324,7 +324,7 @@ class State(object):
self.top['auth'], self.top['server'] = start_server(self.get_server_directories(), old_hostname, old_port)
except StandardError:
self.top['auth'], self.top['server'] = start_server(self.get_server_directories())
- self__.changed = True
+ self.__changed = True
# if we restarted, orphaned jobs might have left exit files
# update the server with exit info
def cleanup_exits(self):
@@ -335,9 +335,9 @@ class State(object):
rc = f.readline().strip()
hsocket = socket.socket()
hsocket.connect(self.get_heartbeat_server())
- socket_send(hsocket, 'e {} {}'.format(jobid, rc))
+ #socket_send(hsocket, 'e {} {}'.format(jobid, rc)) #TODO: Must get jobid from somewhere
hsocket.close()
- os.remove(fn)
+ os.remove(exit_fn)
else:
makedirs(self.get_directory_exits())
def restore_from_save(self, state_fn):
@@ -488,11 +488,11 @@ class MetaJobLocal(object):
hsocket = socket.socket()
try:
hsocket.connect(state.get_heartbeat_server())
- socket_send(hsocket, 'P {}'.format(self.mj.job.jobid))
+ socket_send(hsocket, 'P {}'.format(self.mjob.job.jobid))
line = socket_read(hsocket)
hsocket.close()
except IOError as e:
- log.exception('Failed to get pig/pgid for {}: {!r}'.format(self.mj.job.jobid, e))
+ log.exception('Failed to get pig/pgid for {}: {!r}'.format(self.mjob.job.jobid, e))
return
args = line.split(None, 2)
pid = int(args[0])
@@ -502,7 +502,7 @@ class MetaJobLocal(object):
try:
os.kill(-pgid, sig)
except Exception:
- log.exception('Failed to kill(%s) pgid=-%s for %r. Trying pid=%s' %(sig, pgid, self.mj.job.jobid, pid))
+ log.exception('Failed to kill(%s) pgid=-%s for %r. Trying pid=%s' %(sig, pgid, self.mjob.job.jobid, pid))
os.kill(pid, sig)
def __repr__(self):
return 'MetaJobLocal(%s)' %repr(self.mjob)
@@ -617,6 +617,7 @@ class MetaJobTorque(object):
def __init__(self, mjob):
super(MetaJobTorque, self).__init__(mjob)
self.specific = '-V' # pass enV; '-j oe' => combine out/err
+ self.mjob = mjob
class MetaJobSlurm(object):
def submit(self, state, exe, script_fn):
"""Can raise.
@@ -831,7 +832,7 @@ def delete_jobid(state, jobid, keep=False):
try:
bjob = state.get_bjob(jobid)
except Exception:
- log.exception('In delete_jobid(), unable to find batchjob for %' %(jobid))
+ log.exception('In delete_jobid(), unable to find batchjob for %s' %(jobid))
# TODO: Maybe provide a default grid type, so we can attempt to delete anyway?
return
try:
diff --git a/pypeFLOW/pypeflow/do_task.py b/pypeFLOW/pypeflow/do_task.py
index 1366059..3898dfb 100644
--- a/pypeFLOW/pypeflow/do_task.py
+++ b/pypeFLOW/pypeflow/do_task.py
@@ -1,7 +1,6 @@
#!/usr/bin/env python2.7
from . import do_support, util
import argparse
-import contextlib
import importlib
import inspect
import json
@@ -58,29 +57,6 @@ def get_parser():
help='JSON file, as per epilog.')
return parser
-@contextlib.contextmanager
-def cd(newdir):
- prevdir = os.getcwd()
- LOG.debug('CD: %r <- %r' %(newdir, prevdir))
- os.chdir(os.path.expanduser(newdir))
- try:
- yield
- finally:
- LOG.debug('CD: %r -> %r' %(newdir, prevdir))
- os.chdir(prevdir)
-
-def mkdirs(path):
- if not os.path.isdir(path):
- cmd = 'mkdir -p {}'.format(path)
- util.system(cmd)
-def rmdirs(path):
- if os.path.isdir(path):
- if len(path) < 20 and 'home' in path:
- LOG.error('Refusing to rm {!r} since it might be your homedir.'.format(path))
- return
- cmd = 'rm -rf {}'.format(path)
- util.system(cmd)
-
def wait_for(fn):
global TIMEOUT
LOG.debug('Checking existence of {!r} with timeout={}'.format(fn, TIMEOUT))
@@ -132,7 +108,7 @@ def run(json_fn, timeout, tmpdir):
cfg = json.loads(open(json_fn).read())
LOG.debug(pprint.pformat(cfg))
rundir = os.path.dirname(json_fn)
- with cd(rundir):
+ with util.cd(rundir):
run_cfg_in_tmpdir(cfg, tmpdir)
def run_cfg_in_tmpdir(cfg, tmpdir):
for fn in cfg['inputs'].values():
@@ -150,8 +126,8 @@ def run_cfg_in_tmpdir(cfg, tmpdir):
user = getpass.getuser()
pid = os.getpid()
myrundir = '{tmpdir}/{user}/pypetmp/{finaloutdir}'.format(**locals())
- rmdirs(myrundir)
- mkdirs(myrundir)
+ util.rmdirs(myrundir)
+ util.mkdirs(myrundir)
# TODO(CD): Copy inputs w/ flock.
else:
myrundir = finaloutdir
diff --git a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
index 5dc4c60..40d3c7e 100644
--- a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
+++ b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
@@ -534,7 +534,7 @@ class _PypeTask(object):
for k,v in self.outputs.iteritems():
assert os.path.isabs(v.path), 'For {!r}, output {!r} is not absolute'.format(self.wdir, v)
common = set(self.inputs.keys()) & set(self.outputs.keys())
- assert (not common), 'Keys in both inputs and outputs of PypeTask({}): {!r}'.format(wdir, common)
+ assert (not common), 'Keys in both inputs and outputs of PypeTask({}): {!r}'.format(self.wdir, common)
def __call__(self, func):
self.func = func
self.func_name = '{}.{}'.format(func.__module__, func.__name__)
diff --git a/pypeFLOW/pypeflow/util.py b/pypeFLOW/pypeflow/util.py
index 1b3e13f..5d26de5 100644
--- a/pypeFLOW/pypeflow/util.py
+++ b/pypeFLOW/pypeflow/util.py
@@ -21,6 +21,13 @@ def run(script_fn):
def mkdirs(path):
if not os.path.isdir(path):
os.makedirs(path)
+def rmdirs(path):
+ if os.path.isdir(path):
+ if len(path) < 20 and 'home' in path:
+ LOG.error('Refusing to rm {!r} since it might be your homedir.'.format(path))
+ return
+ cmd = 'rm -rf {}'.format(path)
+ system(cmd)
def system(cmd):
LOG.info(cmd)
rc = os.system(cmd)
diff --git a/travis.sh b/travis.sh
index 439a78e..4e308a4 100755
--- a/travis.sh
+++ b/travis.sh
@@ -5,10 +5,17 @@
set -vex
#env | sort
-#sudo pip install virtualenv
time date # sanity check, since we use 'time' and 'date' in our scripts
-make init # Travis pulls submodules for us, but not --recursive
+
+#git submodule update --init
+# In Bamboo, we do not want this script to alter submodules,
+# since we do that inside Bamboo.
+
+# Note: Travis pulls submodules for us, but not --recursive. But we no longer need that.
+
+make init
source env.sh
+#sudo pip install virtualenv # No! Prefer PYTHONUSERBASE.
make config-edit-user
make -j all
make test
--