[med-svn] [falcon] 01/05: Imported Upstream version 1.8.4
Afif Elghraoui
afif at moszumanska.debian.org
Fri Dec 2 08:49:43 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository falcon.
commit 54d9a95edd5ecc224ac237e62d79844ee1524ef6
Author: Afif Elghraoui <afif at debian.org>
Date: Fri Dec 2 00:12:48 2016 -0800
Imported Upstream version 1.8.4
---
DALIGNER/LAmerge.c | 2 +-
DAZZ_DB/GNUmakefile | 2 +-
FALCON/falcon_kit/mains/get_read_ctg_map.py | 12 +-
FALCON/falcon_kit/mains/run1.py | 50 +-
FALCON_unzip/LICENSE.txt | 32 +
FALCON_unzip/README.md | 1 +
FALCON_unzip/examples/fc_unzip.cfg | 17 +
FALCON_unzip/examples/unzip.sh | 2 +
FALCON_unzip/examples/unzip_obsoleted.sh | 33 +
FALCON_unzip/falcon_unzip/__init__.py | 36 +
FALCON_unzip/falcon_unzip/dedup_h_tigs.py | 70 +
FALCON_unzip/falcon_unzip/get_read_hctg_map.py | 110 ++
FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py | 678 +++++++++
.../falcon_unzip/ovlp_filter_with_phase.py | 354 +++++
FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py | 1545 ++++++++++++++++++++
FALCON_unzip/falcon_unzip/phasing.py | 569 +++++++
FALCON_unzip/falcon_unzip/phasing_readmap.py | 73 +
FALCON_unzip/falcon_unzip/rr_hctg_track.py | 201 +++
FALCON_unzip/falcon_unzip/run_quiver.py | 394 +++++
FALCON_unzip/falcon_unzip/select_reads_from_bam.py | 113 ++
FALCON_unzip/falcon_unzip/unzip.py | 357 +++++
FALCON_unzip/setup.py | 24 +
FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py | 4 +
.../src/py_scripts/fc_get_read_hctg_map.py | 4 +
FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py | 4 +
.../src/py_scripts/fc_ovlp_filter_with_phase.py | 6 +
.../src/py_scripts/fc_phased_ovlp_to_graph.py | 5 +
FALCON_unzip/src/py_scripts/fc_phasing.py | 5 +
FALCON_unzip/src/py_scripts/fc_phasing_readmap.py | 4 +
FALCON_unzip/src/py_scripts/fc_quiver.py | 4 +
FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py | 4 +
.../src/py_scripts/fc_select_reads_from_bam.py | 4 +
.../src/py_scripts/fc_track_reads_htigs0.py | 398 +++++
FALCON_unzip/src/py_scripts/fc_unzip.py | 4 +
FALCON_unzip/src/py_utils/align.sh | 10 +
FALCON_unzip/src/py_utils/call_SV_from_mum.py | 85 ++
FALCON_unzip/src/py_utils/fetch_read.py | 12 +
FALCON_unzip/src/py_utils/split_bam.py | 28 +
pypeFLOW/pwatcher/fs_based.py | 2 +-
pypeFLOW/pwatcher/network_based.py | 2 +-
pypeFLOW/pypeflow/simple_pwatcher_bridge.py | 10 +-
41 files changed, 5213 insertions(+), 57 deletions(-)
diff --git a/DALIGNER/LAmerge.c b/DALIGNER/LAmerge.c
index 6c1f3dc..ef443cf 100644
--- a/DALIGNER/LAmerge.c
+++ b/DALIGNER/LAmerge.c
@@ -168,7 +168,7 @@ static void ovl_reload(IO_block *in, int64 bsize)
remains = in->top - in->ptr;
if (remains > 0)
- memcpy(in->block, in->ptr, remains);
+ memmove(in->block, in->ptr, remains);
in->ptr = in->block;
in->top = in->block + remains;
in->top += fread(in->top,1,bsize-remains,in->stream);
diff --git a/DAZZ_DB/GNUmakefile b/DAZZ_DB/GNUmakefile
index de52eb3..2c11aff 100644
--- a/DAZZ_DB/GNUmakefile
+++ b/DAZZ_DB/GNUmakefile
@@ -36,6 +36,6 @@ clean:
rm -f DBupgrade.Sep.25.2014 DBupgrade.Dec.31.2014 DUSTupgrade.Jan.1.2015
rm -f dazz.db.tar.gz
-SRCS:=$(notdir $(wildcard ${THISDIR}/*.c))
+SRCS:=DB.c QV.c
DEPS:=$(patsubst %.c,%.d,${SRCS})
-include ${DEPS}
diff --git a/FALCON/falcon_kit/mains/get_read_ctg_map.py b/FALCON/falcon_kit/mains/get_read_ctg_map.py
index 349322f..91470ff 100644
--- a/FALCON/falcon_kit/mains/get_read_ctg_map.py
+++ b/FALCON/falcon_kit/mains/get_read_ctg_map.py
@@ -1,7 +1,4 @@
from __future__ import absolute_import
-#from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
-#from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
-#from pypeflow.controller import PypeWorkflow, PypeMPWorkflow, PypeThreadWorkflow
from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
makePypeLocalFile, fn, PypeTask)
PypeThreadTaskBase = MyFakePypeThreadTaskBase
@@ -84,8 +81,7 @@ def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
task = PypeTask(
inputs = {'rawread_db': rawread_db},
outputs = {'rawread_id_file': rawread_id_file},
- TaskType = PypeThreadTaskBase,
- URL = 'task://localhost/dump_rawread_ids')
+ )
wf.addTask(task(dump_rawread_ids))
pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
@@ -94,8 +90,7 @@ def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
task = PypeTask(
inputs = {'pread_db': pread_db},
outputs = {'pread_id_file': pread_id_file},
- TaskType = PypeThreadTaskBase,
- URL = 'task://localhost/dump_pread_ids' )
+ )
wf.addTask(task(dump_pread_ids))
wf.refreshTargets() # block
@@ -115,8 +110,7 @@ def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
task = PypeTask(
inputs = inputs,
outputs = {'read_to_contig_map': read_to_contig_map},
- TaskType = PypeThreadTaskBase,
- URL = 'task://localhost/get_ctg_read_map')
+ )
wf.addTask(task(generate_read_to_ctg_map))
wf.refreshTargets() # block
diff --git a/FALCON/falcon_kit/mains/run1.py b/FALCON/falcon_kit/mains/run1.py
index 8fd7352..d7bb13d 100644
--- a/FALCON/falcon_kit/mains/run1.py
+++ b/FALCON/falcon_kit/mains/run1.py
@@ -1,9 +1,6 @@
from .. import run_support as support
from .. import bash, pype_tasks
from ..util.system import only_these_symlinks
-#from pypeflow.pwatcher_bridge import PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase
-#from pypeflow.data import makePypeLocalFile, fn
-#from pypeflow.task import PypeTask
from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
makePypeLocalFile, fn, PypeTask)
import argparse
@@ -34,8 +31,6 @@ def create_daligner_tasks(basedir, scatter_fn):
make_daligner_task = PypeTask(inputs = inputs,
outputs = outputs,
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = URL,
wdir = wdir,
)
daligner_task = make_daligner_task(pype_tasks.task_run_daligner)
@@ -60,8 +55,6 @@ def create_merge_tasks(basedir, scatter_fn):
make_task = PypeTask(inputs = inputs,
outputs = outputs,
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = URL,
wdir = wdir,
)
task = make_task(pype_tasks.task_run_las_merge)
@@ -86,8 +79,6 @@ def create_consensus_tasks(basedir, scatter_fn):
make_c_task = PypeTask(inputs = inputs,
outputs = outputs,
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = URL,
wdir = wdir,
)
c_task = make_c_task(pype_tasks.task_run_consensus)
@@ -103,9 +94,7 @@ def create_merge_gather_task(wd, inputs):
outputs = {'las_fofn': las_fofn_plf,
'las_fopfn': las_fopfn_plf,
},
- TaskType = MyFakePypeThreadTaskBase,
)
- # URL = 'task://localhost/pmerge_gather')
task = make_task(pype_tasks.task_merge_gather)
return task, las_fofn_plf, las_fopfn_plf
@@ -116,8 +105,7 @@ def create_consensus_gather_task(wd, inputs):
make_cns_gather_task = PypeTask(
inputs = inputs, # consensus_out
outputs = {'preads_fofn': preads_fofn_plf},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/cns_gather' )
+ )
task = make_cns_gather_task(pype_tasks.task_cns_gather)
return task, preads_fofn_plf
@@ -170,7 +158,7 @@ def run(wf, config,
make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
outputs = {'o_fofn': rawread_fofn_plf},
parameters = {},
- TaskType = MyFakePypeThreadTaskBase)
+ )
fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)
wf.addTasks([fofn_abs_task])
@@ -195,7 +183,7 @@ def run(wf, config,
'run_jobs': run_jobs,
},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase)
+ )
build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)
wf.addTasks([build_rdb_task])
@@ -218,8 +206,6 @@ def run(wf, config,
'pread_aln': False,
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/raw-daligner-scatter'
)
task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
wf.addTask(task)
@@ -237,8 +223,7 @@ def run(wf, config,
inputs = daligner_out,
outputs = {'gathered': r_gathered_las_plf},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/rda_check' )
+ )
check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
wf.addTask(check_r_da_task)
wf.refreshTargets(exitOnFailure=exitOnFailure)
@@ -257,8 +242,6 @@ def run(wf, config,
'db_prefix': 'raw_reads',
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/raw-merge-scatter'
)
task = make_task(pype_tasks.task_merge_scatter)
wf.addTask(task)
@@ -286,8 +269,6 @@ def run(wf, config,
'db_prefix': 'raw_reads',
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/raw-cns-scatter'
)
task = make_task(pype_tasks.task_consensus_scatter)
wf.addTask(task)
@@ -310,8 +291,7 @@ def run(wf, config,
'preads_fofn': preads_fofn_plf, },
outputs = {'pre_assembly_report': pre_assembly_report_plf, },
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/report_pre_assembly')
+ )
task = make_task(pype_tasks.task_report_pre_assembly)
wf.addTask(task)
@@ -330,7 +310,7 @@ def run(wf, config,
make_fofn_abs_task = PypeTask(inputs = {'i_fofn': rawread_fofn_plf},
outputs = {'o_fofn': preads_fofn_plf},
parameters = {},
- TaskType = MyFakePypeThreadTaskBase)
+ )
fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_preads)
wf.addTasks([fofn_abs_task])
wf.refreshTargets([fofn_abs_task])
@@ -349,8 +329,7 @@ def run(wf, config,
'run_jobs': run_jobs,
},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/build_pdb')
+ )
build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)
wf.addTasks([build_pdb_task])
@@ -376,8 +355,6 @@ def run(wf, config,
'pread_aln': True,
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/preads-daligner-scatter'
)
task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
wf.addTask(task)
@@ -394,8 +371,7 @@ def run(wf, config,
inputs = daligner_out,
outputs = {'gathered': p_gathered_las_plf},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/pda_check' )
+ )
check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
wf.addTask(check_p_da_task)
wf.refreshTargets(exitOnFailure=exitOnFailure)
@@ -415,9 +391,7 @@ def run(wf, config,
'db_prefix': 'preads',
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- #URL = 'task://localhost/preads-merge-scatter'
- )
+ )
task = make_task(pype_tasks.task_merge_scatter)
wf.addTask(task)
wf.refreshTargets(exitOnFailure=exitOnFailure)
@@ -447,8 +421,7 @@ def run(wf, config,
'config': config,
'sge_option': config['sge_option_fc'],
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/db2falcon' )
+ )
wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))
falcon_asm_done = makePypeLocalFile( os.path.join(falcon_asm_dir, 'falcon_asm_done'))
@@ -463,8 +436,7 @@ def run(wf, config,
'pread_dir': pread_dir,
'sge_option': config['sge_option_fc'],
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/falcon_asm' )
+ )
wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
wf.refreshTargets()
diff --git a/FALCON_unzip/LICENSE.txt b/FALCON_unzip/LICENSE.txt
new file mode 100644
index 0000000..191f795
--- /dev/null
+++ b/FALCON_unzip/LICENSE.txt
@@ -0,0 +1,32 @@
+The Clear BSD License
+
+Copyright (c) 2016, Pacific Biosciences
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the disclaimer
+below) provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its contributors may be used
+ to endorse or promote products derived from this software without specific
+ prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS
+LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
diff --git a/FALCON_unzip/README.md b/FALCON_unzip/README.md
new file mode 100644
index 0000000..fd5191c
--- /dev/null
+++ b/FALCON_unzip/README.md
@@ -0,0 +1 @@
+Please refer to https://github.com/PacificBiosciences/FALCON_unzip/wiki
diff --git a/FALCON_unzip/examples/fc_unzip.cfg b/FALCON_unzip/examples/fc_unzip.cfg
new file mode 100644
index 0000000..787360c
--- /dev/null
+++ b/FALCON_unzip/examples/fc_unzip.cfg
@@ -0,0 +1,17 @@
+[General]
+job_type = SGE
+
+[Unzip]
+input_fofn= input.fofn
+input_bam_fofn= input_bam.fofn
+
+smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
+
+jobqueue = your_sge_queue
+sge_phasing= -pe smp 12 -q %(jobqueue)s
+sge_quiver= -pe smp 12 -q %(jobqueue)s
+sge_track_reads= -pe smp 12 -q %(jobqueue)s
+sge_blasr_aln= -pe smp 24 -q %(jobqueue)s
+sge_hasm= -pe smp 48 -q %(jobqueue)s
+unzip_concurrent_jobs = 64
+quiver_concurrent_jobs = 64
diff --git a/FALCON_unzip/examples/unzip.sh b/FALCON_unzip/examples/unzip.sh
new file mode 100644
index 0000000..529e368
--- /dev/null
+++ b/FALCON_unzip/examples/unzip.sh
@@ -0,0 +1,2 @@
+fc_unzip.py fc_unzip.cfg
+fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/examples/unzip_obsoleted.sh b/FALCON_unzip/examples/unzip_obsoleted.sh
new file mode 100644
index 0000000..88d22da
--- /dev/null
+++ b/FALCON_unzip/examples/unzip_obsoleted.sh
@@ -0,0 +1,33 @@
+# identify the raw reads belonging to each contig
+fc_track_reads.py
+# get the reads assigned to each contig from the initial *.subreads.fasta files
+mkdir -p 3-unzip/reads/
+fc_fetch_reads.py
+# phase the reads
+fc_unzip.py fc_unzip.cfg
+# do the haplotig assembly; this part will be integrated into fc_unzip.py
+mkdir -p 3-unzip/1-hasm/
+cd 3-unzip/1-hasm/
+find ../0-phasing/ -name "rid_to_phase.*" | xargs cat > rid_to_phase.all
+fc_ovlp_filter_with_phase.py --fofn ../../2-asm-falcon/las.fofn --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map ./rid_to_phase.all > preads.p_ovl
+fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
+fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map ./rid_to_phase.all --fasta ../../1-preads_ovl/preads4falcon.fasta
+WD=$PWD
+for f in `cat ../reads/ctg_list `;do cd $WD/$f; fc_dedup_h_tigs.py $f; done
+# prepare for quivering the haplotigs
+cd $WD/..
+rm all_phased_reads all_h_ctg_ids all_h_ctg_edges all_p_ctg_edges all_p_ctg.fa all_h_ctg.fa
+find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
+find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
+find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
+find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
+find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
+find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
+cd ../
+# identify the raw reads belonging to each primary contig or haplotig
+fc_track_reads_htigs.py
+# decompose the PacBio raw reads (with pulse information) in the bam files into an individual bam file for each haplotig or primary contig
+mkdir -p 4-quiver/reads/
+fc_select_reads_from_bam.py input_bam.fofn
+# run blasr and quiver to generate consensus for each haplotig or primary contig
+fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/falcon_unzip/__init__.py b/FALCON_unzip/falcon_unzip/__init__.py
new file mode 100644
index 0000000..94e4fd5
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/__init__.py
@@ -0,0 +1,36 @@
+#################################################################################$$
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
diff --git a/FALCON_unzip/falcon_unzip/dedup_h_tigs.py b/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
new file mode 100644
index 0000000..61bd8c3
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
@@ -0,0 +1,70 @@
+import os
+import sys
+from falcon_kit.FastaReader import FastaReader
+
+def main(args):
+
+ ctg_id = sys.argv[1]
+
+ if os.path.exists("h_ctg_all.{ctg_id}.fa".format(ctg_id = ctg_id)):
+ os.system("nucmer -mum p_ctg.{ctg_id}.fa h_ctg_all.{ctg_id}.fa -p hp_aln".format(ctg_id = ctg_id))
+ os.system("show-coords -T -H -l -c hp_aln.delta > hp_aln.coor")
+ else:
+ sys.exit(0) # it is ok if there is no h_ctg_all.{ctg_id}.fa; we don't want to interrupt the workflow
+
+
+ if os.path.exists("hp_aln.coor"):
+ filter_out = set()
+ with open("hp_aln.coor") as f:
+ for row in f:
+ row = row.strip().split()
+ q_cov = float(row[10])
+ idt = float(row[6])
+ if q_cov > 99 and idt > 99:
+ filter_out.add(row[-1])
+
+ p_ctg_to_phase = {}
+ with open("p_ctg_path.%s" % ctg_id) as f:
+ for row in f:
+ row = row.strip().split()
+ b_id, ph_id = (int(row[-2]), int(row[-1]) )
+ p_ctg_to_phase.setdefault( row[0], {} )
+ p_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
+ p_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
+
+
+ h_ctg_to_phase = {}
+ with open("h_ctg_path.%s" % ctg_id) as f:
+ for row in f:
+ row = row.strip().split()
+ b_id, ph_id = (int(row[-2]), int(row[-1]) )
+ h_ctg_to_phase.setdefault( row[0], {} )
+ h_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
+ h_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
+
+ h_ids = open("h_ctg_ids.%s" % ctg_id,"w")
+ with open("h_ctg.%s.fa" % ctg_id, "w") as f:
+ h_tig_all = FastaReader("h_ctg_all.%s.fa" % ctg_id)
+ for r in h_tig_all:
+ p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {})
+
+ if len(r.sequence) < 500:
+ continue
+
+ if r.name in filter_out:
+ edge_count = sum(h_ctg_to_phase[ r.name ].values())
+ same_phase_to_p_ctg_count = 0
+ for b_id, ph_id in h_ctg_to_phase[ r.name ]:
+ if b_id != -1:
+ if (b_id, ph_id) in p_ctg_phase:
+ same_phase_to_p_ctg_count += h_ctg_to_phase[ r.name ][ (b_id, ph_id) ]
+ unphased_edge_count = h_ctg_to_phase[ r.name ].get( (-1, 0), 0 )
+
+ print r.name, edge_count, unphased_edge_count, same_phase_to_p_ctg_count
+ if edge_count - unphased_edge_count - same_phase_to_p_ctg_count < 5: # unless there are many segments from phases not in the p_ctg, filter it out
+ continue
+
+ print >>f, ">"+r.name
+ print >>f, r.sequence
+ print >> h_ids, r.name
+ h_ids.close()
diff --git a/FALCON_unzip/falcon_unzip/get_read_hctg_map.py b/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
new file mode 100644
index 0000000..18909f2
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
@@ -0,0 +1,110 @@
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+ makePypeLocalFile, fn, PypeTask)
+import argparse
+import logging
+import os
+import sys
+
+def make_dirs(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def generate_read_to_hctg_map(self):
+ rawread_id_file = fn( self.rawread_id_file )
+ pread_id_file = fn( self.pread_id_file )
+ read_to_contig_map = fn( self.read_to_contig_map )
+
+ pread_did_to_rid = open(pread_id_file).read().split('\n')
+ rid_to_oid = open(rawread_id_file).read().split('\n')
+
+ h_ctg_edges = fn( self.h_ctg_edges )
+ p_ctg_edges = fn( self.p_ctg_edges )
+
+ h_ctg_ids = set()
+ with open(fn(self.h_ctg_ids)) as f:
+ for row in f:
+ row = row.strip()
+ h_ctg_ids.add( row )
+
+ pread_to_contigs = {}
+
+ for fname in ( p_ctg_edges, h_ctg_edges):
+ with open(fname) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg = row[0]
+ if len(ctg.split('_')) > 1 and ctg not in h_ctg_ids:
+ continue
+ n1 = row[1]
+ n2 = row[2]
+ pid1 = int(n1.split(':')[0])
+ pid2 = int(n2.split(':')[0])
+ rid1 = pread_did_to_rid[pid1].split('/')[1]
+ rid2 = pread_did_to_rid[pid2].split('/')[1]
+ rid1 = int(int(rid1)/10)
+ rid2 = int(int(rid2)/10)
+ oid1 = rid_to_oid[rid1]
+ oid2 = rid_to_oid[rid2]
+ k1 = (pid1, rid1, oid1)
+ pread_to_contigs.setdefault( k1, set() )
+ pread_to_contigs[ k1 ].add( ctg )
+ k2 = (pid2, rid2, oid2)
+ pread_to_contigs.setdefault( k2, set() )
+ pread_to_contigs[ k2 ].add( ctg )
+
+ with open(read_to_contig_map, 'w') as f:
+ for k in pread_to_contigs:
+ pid, rid, oid = k
+ for ctg in list(pread_to_contigs[ k ]):
+ print >>f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
+
+def get_read_hctg_map(asm_dir, hasm_dir, quiver_dir):
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=12, # TODO: Why was NumThreads ever set? There is only one task!
+ )
+
+ rawread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
+ pread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
+ h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
+ p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
+ h_ctg_ids = makePypeLocalFile( os.path.join(hasm_dir, "all_h_ctg_ids"))
+ read_map_dir = os.path.join(quiver_dir, 'read_maps')
+ make_dirs(read_map_dir)
+
+ read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, 'read_to_contig_map'))
+
+ inputs = { 'rawread_id_file': rawread_id_file,
+ 'pread_id_file': pread_id_file,
+ 'h_ctg_edges': h_ctg_edges,
+ 'p_ctg_edges': p_ctg_edges,
+ 'h_ctg_ids': h_ctg_ids}
+
+ make_task = PypeTask(
+ inputs = inputs,
+ outputs = {'read_to_contig_map': read_to_contig_map},
+ )
+ wf.addTask(make_task(generate_read_to_hctg_map))
+ wf.refreshTargets() # block
+
+
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(description='generate `4-quiver/read_maps/read_to_contig_map` that contains the \
+information from the chain of mapping: (contig id, last col) -> (internal p-read id) -> (internal raw-read id) -> (original read id)\n \
+it assumes the 2-asm-falcon/read_maps ... /dump_rawread_ids/rawread_ids and /dump_pread_ids/pread_ids are already generated',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--basedir', type=str, default="./", help='the base working dir of a FALCON assembly')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ logging.basicConfig()
+ args = parse_args(argv)
+ basedir = args.basedir
+ #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
+ #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
+ asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
+ hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
+ quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
+
+ get_read_hctg_map(asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
diff --git a/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py b/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
new file mode 100644
index 0000000..fe7fd00
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
@@ -0,0 +1,678 @@
+from falcon_kit.fc_asm_graph import AsmGraph
+from falcon_kit.FastaReader import FastaReader
+import os
+import networkx as nx
+from multiprocessing import Pool
+import argparse
+import re
+import sys
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+## for shared memory usage
+global p_asm_G
+global h_asm_G
+global all_rid_to_phase
+global seqs
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def reverse_end( node_id ):
+ node_id, end = node_id.split(":")
+ new_end = "B" if end == "E" else "E"
+ return node_id + ":" + new_end
+
+
+def load_sg_seq(all_read_ids, fasta_fn):
+
+ seqs = {}
+ # load the sequences of the selected p-reads into memory
+ f = FastaReader(fasta_fn)
+ for r in f:
+ if r.name not in all_read_ids:
+ continue
+ seqs[r.name] = r.sequence.upper()
+ return seqs
+
+def generate_haplotigs_for_ctg(input_):
+
+ ctg_id, out_dir = input_
+ global p_asm_G
+ global h_asm_G
+ global all_rid_to_phase
+ global seqs
+ arid_to_phase = all_rid_to_phase[ctg_id]
+
+ mkdir( out_dir )
+
+ ctg_G = p_asm_G.get_sg_for_ctg(ctg_id)
+
+ ctg_nodes = set(ctg_G.nodes())
+
+ sg = nx.DiGraph()
+
+ for v, w in ctg_G.edges():
+
+ vrid = v[:9]
+ wrid = w[:9]
+
+ edge_data = p_asm_G.sg_edges[ (v, w) ]
+ if edge_data[-1] != "G":
+ continue
+
+ vphase = arid_to_phase.get(vrid, (-1,0))
+ wphase = arid_to_phase.get(wrid, (-1,0))
+ if vphase[0] == wphase[0] and vphase[1] != wphase[1]:
+ cross_phase = "Y"
+ else:
+ cross_phase = "N"
+
+ sg.add_node( v, label= "%d_%d" % vphase,
+ phase="%d_%d" % vphase,
+ src="P" )
+
+ sg.add_node( w, label= "%d_%d" % wphase,
+ phase="%d_%d" % wphase,
+ src="P" )
+
+ sg.add_edge(v, w, src="OP", cross_phase = cross_phase)
+
+ # we need to add the complementary edges, as the ctg_graph does not contain the dual edges
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ sg.add_node( rv, label= "%d_%d" % vphase,
+ phase="%d_%d" % vphase,
+ src="P" )
+
+ sg.add_node( rw, label= "%d_%d" % wphase,
+ phase="%d_%d" % wphase,
+ src="P" )
+
+ sg.add_edge(rw, rv, src="OP", cross_phase = cross_phase)
+
+ PG_nodes = set(sg.nodes())
+ PG_edges = set(sg.edges())
+
+ for v, w in h_asm_G.sg_edges:
+
+ vrid = v[:9]
+ wrid = w[:9]
+
+ if vrid not in arid_to_phase:
+ continue
+ if wrid not in arid_to_phase:
+ continue
+
+ if (v, w) in PG_edges:
+ if p_asm_G.sg_edges[(v,w)][-1] == "G":
+ continue
+
+ edge_data = h_asm_G.sg_edges[ (v, w) ]
+
+ if edge_data[-1] != "G":
+ continue
+
+ cross_phase = "N"
+ if v not in PG_nodes:
+ sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if w not in PG_nodes:
+ sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(v, w, src="H", cross_phase = cross_phase)
+
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ if rv not in PG_nodes:
+ sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if rw not in PG_nodes:
+ sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(rw, rv, src="H", cross_phase = cross_phase)
+
+ sg0 = sg.copy()
+ for v, w in h_asm_G.sg_edges:
+ vrid = v[:9]
+ wrid = w[:9]
+ if vrid not in arid_to_phase:
+ continue
+ if wrid not in arid_to_phase:
+ continue
+
+ if (v, w) in PG_edges:
+ if p_asm_G.sg_edges[(v,w)][-1] == "G":
+ continue
+
+ edge_data = h_asm_G.sg_edges[ (v, w) ]
+
+ if sg0.in_degree(w) == 0:
+ cross_phase = "Y"
+ if v not in PG_nodes:
+ sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if w not in PG_nodes:
+ sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
+
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ if rv not in PG_nodes:
+ sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if rw not in PG_nodes:
+ sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
+
+ if sg0.out_degree(v) == 0:
+ cross_phase = "Y"
+ if v not in PG_nodes:
+ sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if w not in PG_nodes:
+ sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
+
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ if rv not in PG_nodes:
+ sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if rw not in PG_nodes:
+ sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
+
+ sg2 = sg.copy()
+ ctg_nodes_r = set([ reverse_end(v) for v in list(ctg_nodes) ])
+ for v, w in ctg_G.edges():
+ sg2.remove_edge(v, w)
+ rv, rw = reverse_end(v), reverse_end(w)
+ sg2.remove_edge(rw, rv)
+ for v in sg2.nodes():
+ if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
+ sg2.remove_node(v)
+
+ nodes_to_remove = set()
+ edges_to_remove = set()
+ for sub_g in nx.weakly_connected_component_subgraphs(sg2):
+ sub_g_nodes = set(sub_g.nodes())
+ if len(sub_g_nodes & ctg_nodes_r) > 0 and len(sub_g_nodes & ctg_nodes) > 0:
+ # remove cross edge
+ sources = [n for n in sub_g.nodes() if sub_g.in_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
+ sinks = [n for n in sub_g.nodes() if sub_g.out_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
+ edges_to_keep = set()
+ for v in sources:
+ for w in sinks:
+ path = []
+ if v in ctg_nodes and w not in ctg_nodes_r:
+ try:
+ path = nx.shortest_path( sub_g, v, w )
+ except nx.exception.NetworkXNoPath:
+ path = []
+ elif v not in ctg_nodes and w in ctg_nodes_r:
+ try:
+ path = nx.shortest_path( sub_g, v, w )
+ except nx.exception.NetworkXNoPath:
+ path = []
+
+ if len(path) >= 2:
+ v1 = path[0]
+ for w1 in path[1:]:
+ edges_to_keep.add( (v1, w1) )
+ rv1, rw1 = reverse_end(v1), reverse_end(w1)
+ edges_to_keep.add( (rw1, rv1) )
+ v1 = w1
+ for v, w in sub_g.edges():
+ if (v, w) not in edges_to_keep:
+ edges_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edges_to_remove.add( (rw, rv) )
+
+
+ if len(sub_g_nodes & ctg_nodes_r) == 0 and len(sub_g_nodes & ctg_nodes) == 0:
+ nodes_to_remove.update( sub_g_nodes )
+ nodes_to_remove.update( set( [reverse_end(v) for v in list(sub_g_nodes)] ) )
+
+ for v, w in list(edges_to_remove):
+ sg.remove_edge(v, w)
+
+ for v in nodes_to_remove:
+ sg.remove_node(v)
+
+ for v in sg.nodes():
+ if sg.out_degree(v) == 0 and sg.in_degree(v) == 0:
+ sg.remove_node(v)
+
+ #nx.write_gexf(sg, "full_g.gexf")
+
+ s_node = p_asm_G.ctg_data[ctg_id][5][0][0]
+ t_node = p_asm_G.ctg_data[ctg_id][5][-1][-1]
+
+ for v, w in sg.edges():
+ phase0 = sg.node[v]["phase"].split("_")
+ phase1 = sg.node[w]["phase"].split("_")
+ if phase0 == phase1:
+ sg[v][w]["weight"] = 10
+ sg[v][w]["score"] = 1
+ sg[v][w]["label"] = "type0"
+ else:
+ if phase0[0] == phase1[0]:
+ sg[v][w]["weight"] = 1
+ sg[v][w]["score"] = 100000
+ sg[v][w]["label"] = "type1"
+ else:
+ sg[v][w]["weight"] = 5
+ sg[v][w]["score"] = 50
+ sg[v][w]["label"] = "type2"
+
+
+ sg2 = sg.copy()
+ edge_to_remove = set()
+ for v, w in sg2.edges():
+ if sg2[v][w]["src"] == "ext":
+ edge_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edge_to_remove.add( (rw, rv) )
+
+ if sg2.node[v]["phase"] == sg2.node[w]["phase"]:
+ continue
+ flag1 = 0
+ flag2 = 0
+ for e in sg2.out_edges(v):
+ if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
+ flag1 = 1
+ break
+ if flag1 == 1:
+ for e in sg2.in_edges(w):
+ if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
+ flag2 = 1
+ break
+ if flag2 == 1:
+ edge_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edge_to_remove.add( (rw, rv) )
+
+ for v, w in list(edge_to_remove):
+ sg2.remove_edge(v, w)
+
+ ## remove some new shortcuts caused by repeats in the haplotype edges
+ edge_to_remove = set()
+ with open(os.path.join(out_dir, "path_len.%s" % ctg_id), "w") as f:
+
+ all_length_from_s = nx.shortest_path_length(sg, source=s_node)
+ edge_to_remove = set()
+
+ for w in all_length_from_s:
+ longest = 0
+ if sg.in_degree(w) >= 2:
+ for e in sg.in_edges(w):
+ v = e[0]
+ if v in all_length_from_s:
+ len_ = all_length_from_s[v]
+ if len_ > longest:
+ longest = len_
+ if longest == 0:
+ continue
+ for e in sg.in_edges(w):
+ v = e[0]
+ if v in all_length_from_s:
+ len_ = all_length_from_s[v]
+ print >>f, ctg_id, "link_lengths", v, w, longest, len_
+ if longest - len_ > 10 and sg[v][w]["src"] != "OP": #make sure we always have one path derived from the original graph from s to t
+ edge_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edge_to_remove.add( (rw, rv) )
+
+ for v, w in list(edge_to_remove):
+ sg.remove_edge(v, w)
+ print >>f, ctg_id, "removed", v, w
+
+ for v, w in list(edge_to_remove):
+ try:
+ sg2.remove_edge(v, w)
+ except Exception:
+ pass
+
+
+ nx.write_gexf(sg, os.path.join(out_dir, "sg.gexf" ))
+ nx.write_gexf(sg2, os.path.join(out_dir, "sg2.gexf" ))
+
+
+ try:
+ s_path = nx.shortest_path(sg2, source=s_node, target=t_node, weight="score")
+ except nx.exception.NetworkXNoPath:
+ s_path = nx.shortest_path(sg, source=s_node, target=t_node, weight="score")
+
+ s_path_edges = []
+ for i in xrange(len(s_path)-1):
+ v = s_path[i]
+ w = s_path[i+1]
+ sg[v][w]["weight"] = 15
+ s_path_edges.append( (v,w) )
+
+ s_path_edge_set = set(s_path_edges)
+
+
+
+ #output the updated primary contig
+ p_tig_path = open(os.path.join(out_dir, "p_ctg_path.%s" % ctg_id),"w")
+ p_tig_fa = open(os.path.join(out_dir, "p_ctg.%s.fa" % ctg_id),"w")
+ edges_to_remove1 = set()
+ edges_to_remove2 = set()
+ with open(os.path.join(out_dir, "p_ctg_edges.%s" % ctg_id), "w") as f:
+ seq = []
+ for v, w in s_path_edges:
+ sg[v][w]["h_edge"] = 1
+ vrid = v.split(":")[0]
+ wrid = w.split(":")[0]
+ vphase = arid_to_phase.get(vrid, (-1,0))
+ wphase = arid_to_phase.get(wrid, (-1,0))
+ print >>f, "%s" % ctg_id, v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
+
+ if sg.edge[v][w]["src"] == "OP":
+ edge_data = p_asm_G.sg_edges[ (v,w) ]
+ else:
+ edge_data = h_asm_G.sg_edges[ (v,w) ]
+
+ seq_id, s, t = edge_data[0]
+ if s < t:
+ seq.append(seqs[ seq_id ][ s:t ])
+ else:
+ seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ s:t:-1 ] ]))
+ print >>p_tig_path, "%s" % ctg_id, v, w, seq_id, s, t, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
+ sg[v][w]["tig_id"] = "%s" % ctg_id
+
+ rv, rw = reverse_end(v), reverse_end(w)
+ edges_to_remove1.add( (v, w) )
+ edges_to_remove2.add( (rw, rv) )
+
+ print >> p_tig_fa, ">%s" % ctg_id
+ print >> p_tig_fa, "".join(seq)
+
+ p_tig_fa.close()
+ p_tig_path.close()
+
+
+
+ sg2 = sg.copy()
+ reachable1 = nx.descendants(sg2, s_node)
+ sg2_r = sg2.reverse()
+ reachable2 = nx.descendants(sg2_r, t_node)
+
+ reachable_all = reachable1 | reachable2
+ reachable_both = reachable1 & reachable2
+
+
+ for v, w in list(edges_to_remove2 | edges_to_remove1):
+ sg2.remove_edge( v, w )
+
+ for v, w in sg2.edges():
+ if sg2[v][w]["cross_phase"] == "Y":
+ sg2.remove_edge( v, w )
+
+ for v in sg2.nodes():
+ if v not in reachable_all:
+ sg2.remove_node(v)
+
+ for v in sg2.nodes():
+ if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
+ sg2.remove_node(v)
+ continue
+ if v in reachable_both:
+ sg2.node[v]["reachable"] = 1
+ else:
+ sg2.node[v]["reachable"] = 0
+
+ dump_graph = False # the code segment below is useful for showing the graph
+ if dump_graph == True:
+ nx.write_gexf(sg2, "%s_1.gexf" % ctg_id)
+
+ p_path_nodes = set(s_path)
+ p_path_rc_nodes = set( [reverse_end(v) for v in s_path] )
+
+ sg2_nodes = set(sg2.nodes())
+ for v in p_asm_G.get_sg_for_ctg(ctg_id).nodes():
+ rv = reverse_end(v)
+ p_path_rc_nodes.add( rv )
+ if rv in sg2_nodes:
+ sg2.remove_node(rv)
+
+
+ h_tig_path = open(os.path.join(out_dir, "h_ctg_path.%s" % ctg_id),"w")
+ h_tig_fa = open(os.path.join(out_dir, "h_ctg_all.%s.fa" % ctg_id),"w")
+ edges_to_remove = set()
+
+ labelled_node = set()
+ with open(os.path.join(out_dir, "h_ctg_edges.%s" % ctg_id),"w") as f:
+ h_tig_id = 1
+ h_paths = {}
+ #print "number of components:", len([tmp for tmp in nx.weakly_connected_component_subgraphs(sg2)])
+ for sub_hg_0 in nx.weakly_connected_component_subgraphs(sg2):
+ sub_hg = sub_hg_0.copy()
+ while sub_hg.size() > 5:
+ #print "sub_hg size:", len(sub_hg.nodes())
+ sources = [n for n in sub_hg.nodes() if sub_hg.in_degree(n) != 1 ]
+ sinks = [n for n in sub_hg.nodes() if sub_hg.out_degree(n) != 1 ]
+
+
+ #print "number of sources", len(sources), sources
+ #print "number of sinks", len(sinks), sinks
+ if len(sources) == 0 and len(sinks) == 0: #TODO: the rest of the sub-graph consists of cycles; we need to break and print a warning message
+ break
+
+ longest = []
+
+ eliminated_sinks = set()
+ s_longest = {}
+ for s in sources:
+ #print "test source",s, len(eliminated_sinks)
+ if s in labelled_node:
+ continue
+ s_path = []
+ for t in sinks:
+ if t in eliminated_sinks:
+ continue
+ try:
+ path = nx.shortest_path(sub_hg, s, t, weight="score")
+ #print "test path len:", len(path), s, t
+ except nx.exception.NetworkXNoPath:
+ path = []
+ continue
+ s_path.append( [ path, t ] )
+ s_path.sort(key = lambda x: -len(x[0]))
+ if len(s_path) == 0:
+ continue
+ s_longest[s] = s_path[0][0]
+ if len(s_longest[s]) > len(longest):
+ longest = s_longest[s]
+ #print "s longest", longest[0], longest[-1], len(longest)
+ for path, t in s_path[1:]:
+ eliminated_sinks.add(t)
+ #print "elimated t", t
+
+
+ if len(longest) == 0:
+ break
+
+ s = longest[0]
+ t = longest[-1]
+ h_paths[ ( s, t ) ] = longest
+
+ labelled_node.add(s)
+ rs = reverse_end(s)
+ labelled_node.add(rs)
+
+ for v in longest:
+ sub_hg.remove_node(v)
+
+ for s, t in h_paths:
+ longest = h_paths[ (s, t) ]
+ #print "number of node in path", s,t,len(longest)
+ seq = []
+ for v, w in zip(longest[:-1], longest[1:]):
+ sg[v][w]["h_edge"] = 1
+ if sg.edge[v][w]["src"] == "OP":
+ edge_data = p_asm_G.sg_edges[ (v,w) ]
+ else:
+ edge_data = h_asm_G.sg_edges[ (v,w) ]
+ vrid = v.split(":")[0]
+ wrid = w.split(":")[0]
+ vphase = arid_to_phase.get(vrid, (-1,0))
+ wphase = arid_to_phase.get(wrid, (-1,0))
+ print >>f, "%s_%03d" % (ctg_id, h_tig_id), v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
+
+ if sg.edge[v][w]["src"] == "OP":
+ edge_data = p_asm_G.sg_edges[ (v,w) ]
+ else:
+ edge_data = h_asm_G.sg_edges[ (v,w) ]
+
+ seq_id, sp, tp = edge_data[0]
+ if sp < tp:
+ seq.append(seqs[ seq_id ][ sp:tp ])
+ else:
+ seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ sp:tp:-1 ] ]))
+ print >> h_tig_path, "%s_%03d" % (ctg_id, h_tig_id), v, w, seq_id, sp, tp, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
+ sg[v][w]["tig_id"] = "%s_%03d" % (ctg_id, h_tig_id)
+
+ rv, rw = reverse_end(v), reverse_end(w)
+ edges_to_remove.add( (v, w) )
+ edges_to_remove.add( (rw, rv) )
+
+ print >> h_tig_fa, ">%s_%03d" % (ctg_id, h_tig_id)
+ print >> h_tig_fa, "".join(seq)
+ h_tig_id += 1
+
+
+ h_tig_fa.close()
+ h_tig_path.close()
+
+ dump_graph = False # the code segment below is useful for showing the graph
+ if dump_graph == True:
+ for v, w in sg.edges():
+ if "h_edge" not in sg[v][w]:
+ sg[v][w]["h_edge"] = 0
+ if v in reachable_all:
+ sg.node[v]["reachable"] = 1
+ else:
+ sg.node[v]["reachable"] = 0
+ if w in reachable_all:
+ sg.node[w]["reachable"] = 1
+ else:
+ sg.node[w]["reachable"] = 0
+
+ nx.write_gexf(sg, "%s_0.gexf" % ctg_id)
+
+
+
+def parse_args(argv):
+
+ parser = argparse.ArgumentParser(description='lay out haplotigs from the primary assembly graph and the phased assembly graph')
+
+ parser.add_argument('--fc_asm_path', type=str, help='path to the primary Falcon assembly output directory', required=True)
+ parser.add_argument('--fc_hasm_path', type=str, help='path to the phased Falcon assembly output directory', required=True)
+ parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', default = "all", required=True)
+ parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir; defaults to the current working directory')
+ parser.add_argument('--rid_phase_map', type=str, help="path to the file that encodes the relationship of the read ids to phase blocks", required=True)
+ parser.add_argument('--fasta', type=str, help="sequence file of the p-reads", required=True)
+ args = parser.parse_args(argv[1:])
+
+ return args
+
+def main(argv=sys.argv):
+
+ # make life easier for now. will refactor it out if possible
+ global all_rid_to_phase
+ global p_asm_G
+ global h_asm_G
+ global all_rid_to_phase
+ global seqs
+
+ args = parse_args(argv)
+ fc_asm_path = args.fc_asm_path
+ fc_hasm_path = args.fc_hasm_path
+ ctg_id = args.ctg_id
+ base_dir = args.base_dir
+ fasta_fn = args.fasta
+
+ p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"),
+ os.path.join(fc_asm_path, "utg_data"),
+ os.path.join(fc_asm_path, "ctg_paths") )
+
+ h_asm_G = AsmGraph( os.path.join(fc_hasm_path, "sg_edges_list"),
+ os.path.join(fc_hasm_path, "utg_data"),
+ os.path.join(fc_hasm_path, "ctg_paths") )
+
+
+ all_rid_to_phase = {}
+
+ all_read_ids = set()
+ with open(args.rid_phase_map) as f:
+ for row in f:
+ row = row.strip().split()
+ all_rid_to_phase.setdefault( row[1], {} )
+ all_rid_to_phase[row[1]][row[0]] = (int(row[2]), int(row[3]))
+ all_read_ids.add(row[0])
+
+ for v, w in p_asm_G.sg_edges:
+ if p_asm_G.sg_edges[ (v, w) ][-1] != "G":
+ continue
+ v = v.split(":")[0]
+ w = w.split(":")[0]
+ all_read_ids.add(v)
+ all_read_ids.add(w)
+
+ for v, w in h_asm_G.sg_edges:
+ if h_asm_G.sg_edges[ (v, w) ][-1] != "G":
+ continue
+ v = v.split(":")[0]
+ w = w.split(":")[0]
+ all_read_ids.add(v)
+ all_read_ids.add(w)
+
+ seqs = load_sg_seq(all_read_ids, fasta_fn)
+
+ if ctg_id == "all":
+ ctg_id_list = p_asm_G.ctg_data.keys()
+ else:
+ ctg_id_list = [ctg_id]
+
+ exe_list = []
+ for ctg_id in ctg_id_list:
+ if ctg_id[-1] != "F":
+ continue
+ if ctg_id not in all_rid_to_phase:
+ continue
+ exe_list.append( (ctg_id, os.path.join(".", ctg_id)) )
+
+ exec_pool = Pool(4) #TODO, make this configurable
+ exec_pool.map( generate_haplotigs_for_ctg, exe_list)
+ #map( generate_haplotigs_for_ctg, exe_list)
diff --git a/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py b/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
new file mode 100644
index 0000000..ab92956
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from multiprocessing import Pool
+import subprocess as sp
+import shlex
+import argparse
+import re
+import sys
+
+arid2phase = {}
+
+def filter_stage1(input_):
+ db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len = input_
+ try:
+ ignore_rtn = []
+ current_q_id = None
+ ave_idt = 0.0
+ all_over_len = 0.0
+ overlap_data = {"5p":0, "3p":0}
+ overlap_phase = {"5p":set(), "3p":set()}
+ q_id = None
+
+ for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) ) ).splitlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+
+ if q_id not in arid2phase:
+ continue
+ if t_id not in arid2phase:
+ continue
+ if arid2phase[t_id][0] != arid2phase[q_id][0]:
+ continue
+
+ if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
+ continue
+
+ if q_id != None and q_id != current_q_id:
+
+ left_count = overlap_data["5p"]
+ right_count = overlap_data["3p"]
+
+ if abs(left_count - right_count) > max_diff:
+ ignore_rtn.append( current_q_id )
+ elif left_count > max_ovlp or right_count > max_ovlp:
+ ignore_rtn.append( current_q_id )
+ elif left_count < min_ovlp or right_count < min_ovlp:
+ ignore_rtn.append( current_q_id )
+
+ #remove unphased reads if they are sandwiched by the reads from the same phase
+ if current_q_id not in arid2phase:
+ left_set = overlap_phase["5p"]
+ right_set = overlap_phase["3p"]
+ if len( left_set & right_set ) > 0:
+ ignore_rtn.append( current_q_id )
+
+ overlap_data = {"5p":0, "3p":0}
+ overlap_phase = {"5p":set(), "3p":set()}
+ current_q_id = q_id
+ ave_idt = 0.0
+ all_over_len = 0.0
+
+ overlap_len = -int(l[2])
+ idt = float(l[3])
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+ if idt < 90:
+ continue
+
+ if q_l < min_len or t_l < min_len:
+ continue
+
+ if l[-1] in ("contains", "overlap"):
+ ave_idt += idt * overlap_len
+ all_over_len += overlap_len
+ if q_s == 0:
+ overlap_data["5p"] += 1
+ if t_id in arid2phase:
+ overlap_phase["5p"].add( arid2phase[t_id] )
+ if q_e == q_l:
+ overlap_data["3p"] += 1
+ if t_id in arid2phase:
+ overlap_phase["3p"].add( arid2phase[t_id] )
+
+ if q_id != None:
+ left_count = overlap_data["5p"]
+ right_count = overlap_data["3p"]
+ if abs(left_count - right_count) > max_diff:
+ ignore_rtn.append( current_q_id )
+ elif left_count > max_ovlp or right_count > max_ovlp:
+ ignore_rtn.append( current_q_id )
+ elif left_count < min_ovlp or right_count < min_ovlp:
+ ignore_rtn.append( current_q_id )
+
+ #remove unphased reads if they are sandwiched by the reads from the same phase
+ if current_q_id not in arid2phase:
+ left_set = overlap_phase["5p"]
+ right_set = overlap_phase["3p"]
+ if len( left_set & right_set ) > 0:
+ ignore_rtn.append( current_q_id )
+
+
+ return fn, ignore_rtn
+
+ except (KeyboardInterrupt, SystemExit):
+ return
+
+def filter_stage2(input_):
+ db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set = input_
+ try:
+ contained_id = set()
+ for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn))).splitlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+
+
+ if q_id not in arid2phase:
+ continue
+ if t_id not in arid2phase:
+ continue
+
+ if arid2phase[t_id][0] != arid2phase[q_id][0]:
+ continue
+ if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
+ continue
+
+
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+ idt = float(l[3])
+ if idt < 90:
+ continue
+
+ if q_l < min_len or t_l < min_len:
+ continue
+
+ if q_id in ignore_set:
+ continue
+ if t_id in ignore_set:
+ continue
+ if l[-1] == "contained":
+ contained_id.add(q_id)
+ if l[-1] == "contains":
+ contained_id.add(t_id)
+ return fn, contained_id
+
+ except (KeyboardInterrupt, SystemExit):
+ return
+
+def filter_stage3(input_):
+
+ db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn = input_
+
+ overlap_data = {"5p":[], "3p":[]}
+ try:
+ ovlp_output = []
+ current_q_id = None
+ for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) )).splitlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+
+
+ if q_id not in arid2phase:
+ continue
+ if t_id not in arid2phase:
+ continue
+
+ if arid2phase[t_id][0] != arid2phase[q_id][0]:
+ continue
+ if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
+ continue
+
+ if current_q_id == None:
+ current_q_id = q_id
+ overlap_data = {"5p":[], "3p":[]}
+
+ elif q_id != current_q_id:
+ left = overlap_data["5p"]
+ right = overlap_data["3p"]
+ left.sort()
+ right.sort()
+
+ for i in xrange(len(left)):
+ inphase, score, m_range, ovlp = left[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ for i in xrange(len(right)):
+ inphase, score, m_range, ovlp = right[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ overlap_data = {"5p":[], "3p":[]}
+ current_q_id = q_id
+
+ if q_id in contained_set:
+ continue
+ if t_id in contained_set:
+ continue
+ if q_id in ignore_set:
+ continue
+ if t_id in ignore_set:
+ continue
+
+ overlap_len = -int(l[2])
+ idt = float(l[3])
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+ if idt < 90:
+ continue
+ if q_l < min_len or t_l < min_len:
+ continue
+ if q_s == 0:
+ l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
+ inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
+ #nphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
+ overlap_data["5p"].append( (-inphase, -overlap_len, t_l - (t_e - t_s), l ) )
+ elif q_e == q_l:
+ l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
+ inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
+ #inphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
+ overlap_data["3p"].append( (-inphase, -overlap_len, t_l - (t_e - t_s), l ) )
+
+ left = overlap_data["5p"]
+ right = overlap_data["3p"]
+ left.sort()
+ right.sort()
+
+
+ for i in xrange(len(left)):
+ inphase, score, m_range, ovlp = left[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ for i in xrange(len(right)):
+ inphase, score, m_range, ovlp = right[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ return fn, ovlp_output
+ except (KeyboardInterrupt, SystemExit):
+ return
+
+
+def parse_args(argv):
+
+ parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
+ parser.add_argument('--n_core', type=int, default=4,
+ help='number of processes used for the filtering stages')
+ parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+ parser.add_argument('--db', type=str, help='read db file path')
+ parser.add_argument('--max_diff', type=int, help="max difference of 5' and 3' coverage")
+ parser.add_argument('--max_cov', type=int, help="max coverage of 5' or 3' coverage")
+ parser.add_argument('--min_cov', type=int, help="min coverage of 5' or 3' coverage")
+ parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads")
+ parser.add_argument('--bestn', type=int, default=10, help="output at least best n overlaps on 5' or 3' ends if possible")
+ parser.add_argument('--rid_phase_map', type=str, help="the file that encodes the relationship of the read ids to phase blocks", required=True)
+ args = parser.parse_args(argv[1:])
+
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+
+ max_diff = args.max_diff
+ max_cov = args.max_cov
+ min_cov = args.min_cov
+ min_len = args.min_len
+ bestn = args.bestn
+ db_fn = args.db
+
+ with open(args.rid_phase_map) as f:
+ for row in f:
+ row = row.strip().split()
+ arid2phase[row[0]] = (row[1], row[2], row[3]) #ctg_id, phase_blk_id, phase_id
+
+ exe_pool = Pool(args.n_core)
+
+ file_list = open(args.fofn).read().split("\n")
+ inputs = []
+ for fn in file_list:
+ if len(fn) != 0:
+ inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len) )
+
+ ignore_all = []
+ for res in exe_pool.imap(filter_stage1, inputs):
+ ignore_all.extend( res[1] )
+
+ inputs = []
+ ignore_all = set(ignore_all)
+ for fn in file_list:
+ if len(fn) != 0:
+ inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all) )
+ contained = set()
+ for res in exe_pool.imap(filter_stage2, inputs):
+ contained.update(res[1])
+ #print res[0], len(res[1]), len(contained)
+
+ #print "all", len(contained)
+ inputs = []
+ ignore_all = set(ignore_all)
+ for fn in file_list:
+ if len(fn) != 0:
+ inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn) )
+ for res in exe_pool.imap(filter_stage3, inputs):
+ for l in res[1]:
+ print " ".join(l)
diff --git a/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py b/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
new file mode 100644
index 0000000..772f011
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
@@ -0,0 +1,1545 @@
+import networkx as nx
+import os
+import shlex
+import sys
+import subprocess
+import argparse
+
+DEBUG_LOG_LEVEL = 0
+
+class SGNode(object):
+ """
+ class representing a node in the string graph
+ """
+ def __init__(self, node_name):
+ self.name = node_name
+ self.out_edges = []
+ self.in_edges = []
+
+ def add_out_edge(self, out_edge):
+ self.out_edges.append(out_edge)
+
+ def del_out_edge(self, out_edge):
+ self.out_edges.remove(out_edge)
+
+ def add_in_edge(self, in_edge):
+ self.in_edges.append(in_edge)
+
+ def del_in_edge(self, in_edge):
+ self.in_edges.remove(in_edge)
+
+class SGEdge(object):
+ """
+ class representing an edge in the string graph
+ """
+ def __init__(self, in_node, out_node):
+ self.in_node = in_node
+ self.out_node = out_node
+ self.attr = {}
+ def set_attribute(self, attr, value):
+ self.attr[attr] = value
+
+def reverse_end( node_id ):
+ node_id, end = node_id.split(":")
+ new_end = "B" if end == "E" else "E"
+ return node_id + ":" + new_end
+
+class StringGraph(object):
+ """
+ class representing the string graph
+ """
+ def __init__(self):
+ self.nodes = {}
+ self.edges = {}
+ self.n_mark = {}
+ self.e_reduce = {}
+ self.repeat_overlap = {}
+
+ def add_node(self, node_name):
+ """
+ add a node into the graph by given a node name
+ """
+ if node_name not in self.nodes:
+ self.nodes[node_name] = SGNode(node_name)
+
+ def add_edge(self, in_node_name, out_node_name, **attributes):
+ """
+ add an edge into the graph by given a pair of nodes
+ """
+ if (in_node_name, out_node_name) not in self.edges:
+
+ self.add_node(in_node_name)
+ self.add_node(out_node_name)
+ in_node = self.nodes[in_node_name]
+ out_node = self.nodes[out_node_name]
+
+ edge = SGEdge(in_node, out_node)
+ self.edges[ (in_node_name, out_node_name) ] = edge
+ in_node.add_out_edge(edge)
+ out_node.add_in_edge(edge)
+ edge = self.edges[ (in_node_name, out_node_name) ]
+ for k, v in attributes.items():
+ edge.attr[k] = v
+
+ def remove_cross_phase_edges(self):
+
+ to_remove = {}
+
+ for k,e in self.edges.items():
+ if e.attr["phase"][0] != e.attr["phase"][1]:
+ to_remove[k] = e
+
+ for k, e in to_remove.items():
+ in_node_name, out_node_name = k
+ in_node = self.nodes[in_node_name]
+ out_node = self.nodes[out_node_name]
+ in_node.del_out_edge(e)
+ out_node.del_in_edge(e)
+ del self.edges[k]
+
+ return to_remove
+
+ def init_reduce_dict(self):
+ for e in self.edges:
+ self.e_reduce[e] = False
+
+ def bfs_nodes(self, n, exclude = None, depth=5):
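+        # bounded breadth-first expansion from node n: collect successor nodes
+        # (skipping `exclude`), popping at most depth-1 frontier nodes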
+ all_nodes = set()
+ all_nodes.add(n)
+ candidate_nodes = set()
+ candidate_nodes.add(n)
+ dp = 1
+ while dp < depth and len(candidate_nodes) > 0:
+ v = candidate_nodes.pop()
+ for e in v.out_edges :
+ w = e.out_node
+ if w == exclude:
+ continue
+ if w not in all_nodes:
+ all_nodes.add(w)
+ if len(w.out_edges) > 0:
+ candidate_nodes.add(w)
+ dp += 1
+
+ return all_nodes
+
+
+ def mark_chimer_edges(self):
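+        """
+        Heuristically detect chimeric reads: a candidate node is a successor of
+        some node with multiple out-edges and a predecessor of some node with
+        multiple in-edges; if its own successors share nothing with the
+        successors of its predecessors, and bounded BFS expansions of the two
+        sides stay disjoint, every edge touching the node (and the corresponding
+        reverse-complement edges) is marked reduced as a chimer edge.
+        """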
+
+ multi_in_nodes = {}
+ multi_out_nodes = {}
+ for n_name in self.nodes:
+ n = self.nodes[n_name]
+ out_nodes = [ e.out_node for e in n.out_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False ]
+ in_nodes = [e.in_node for e in n.in_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False]
+
+ if len(out_nodes) >= 2:
+ multi_out_nodes[n_name] = out_nodes
+ if len(in_nodes) >= 2:
+ multi_in_nodes[n_name] = in_nodes
+
+ chimer_candidates = set()
+ out_set = set()
+ in_set = set()
+ for n_name in multi_out_nodes:
+ out_nodes = set(multi_out_nodes[n_name])
+ out_set |= out_nodes
+
+ for n_name in multi_in_nodes:
+ in_nodes = set(multi_in_nodes[n_name])
+ in_set |= in_nodes
+
+ chimer_candidates = out_set & in_set
+
+ chimer_nodes = []
+ chimer_edges = set()
+ for n in chimer_candidates:
+ out_nodes = set( [ e.out_node for e in n.out_edges ] )
+ test_set = set()
+ for in_node in [e.in_node for e in n.in_edges ]:
+ test_set = test_set | set( [ e.out_node for e in in_node.out_edges ] )
+ test_set -= set([n])
+ if len( out_nodes & test_set ) == 0:
+ flow_node1 = set()
+ flow_node2 = set()
+ for v in list(out_nodes):
+ flow_node1 |= self.bfs_nodes(v, exclude=n)
+ for v in list(test_set):
+ flow_node2 |= self.bfs_nodes(v, exclude=n)
+ if len( flow_node1 & flow_node2 ) == 0:
+ for e in n.out_edges:
+ v, w = e.in_node.name, e.out_node.name
+ if self.e_reduce[ (v, w) ] != True:
+ self.e_reduce[ (v, w) ] = True
+ chimer_edges.add( (v, w) )
+ rv = reverse_end(w)
+ rw = reverse_end(v)
+ self.e_reduce[ (rv, rw) ] = True
+ chimer_edges.add( (rv, rw) )
+
+ for e in n.in_edges:
+ v, w = e.in_node.name, e.out_node.name
+ if self.e_reduce[ (v, w) ] != True:
+ self.e_reduce[ (v, w) ] = True
+ chimer_edges.add( (v, w) )
+ rv = reverse_end(w)
+ rw = reverse_end(v)
+ self.e_reduce[ (rv, rw) ] = True
+ chimer_edges.add( (rv, rw) )
+ chimer_nodes.append( n.name )
+ chimer_nodes.append( reverse_end(n.name) )
+
+ return chimer_nodes, chimer_edges
+
+ def mark_spur_edge(self):
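+        """
+        Remove dead-end branches: for a node with multiple out-edges (in-edges),
+        reduce any edge leading to (coming from) a node that has no further
+        out-edges (in-edges), together with its reverse-complement edge.
+        """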
+
+ removed_edges = set()
+ for v in self.nodes:
+ if len(self.nodes[v].out_edges) > 1:
+ for out_edge in self.nodes[v].out_edges:
+ w = out_edge.out_node.name
+
+ if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
+ self.e_reduce[(v, w)] = True
+ removed_edges.add( (v, w) )
+ v2, w2 = reverse_end(w), reverse_end(v)
+ self.e_reduce[(v2, w2)] = True
+ removed_edges.add( (v2, w2) )
+
+ if len(self.nodes[v].in_edges) > 1:
+ for in_edge in self.nodes[v].in_edges:
+ w = in_edge.in_node.name
+ if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
+ self.e_reduce[(w, v)] = True
+ removed_edges.add( (w, v) )
+ v2, w2 = reverse_end(w), reverse_end(v)
+ self.e_reduce[(w2, v2)] = True
+ removed_edges.add( (w2, v2) )
+ return removed_edges
+
+ def mark_tr_edges(self):
+ """
+ transitive reduction
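+
+        For each node, an out-edge is marked reduced when its target can also be
+        reached through a two-edge detour whose combined length stays within the
+        longest out-edge length plus a FUZZ tolerance of 500 bases; the
+        corresponding reverse-complement edge is reduced as well.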
+ """
+ n_mark = self.n_mark
+ e_reduce = self.e_reduce
+ FUZZ = 500
+ for n in self.nodes:
+ n_mark[n] = "vacant"
+
+ for n_name, node in self.nodes.items():
+
+ out_edges = node.out_edges
+ if len(out_edges) == 0:
+ continue
+
+ out_edges.sort(key=lambda x: x.attr["length"])
+
+ for e in out_edges:
+ w = e.out_node
+ n_mark[ w.name ] = "inplay"
+
+ max_len = out_edges[-1].attr["length"]
+
+ max_len += FUZZ
+
+ for e in out_edges:
+ e_len = e.attr["length"]
+ w = e.out_node
+ if n_mark[w.name] == "inplay":
+ w.out_edges.sort( key=lambda x: x.attr["length"] )
+ for e2 in w.out_edges:
+ if e2.attr["length"] + e_len < max_len:
+ x = e2.out_node
+ if n_mark[x.name] == "inplay":
+ n_mark[x.name] = "eliminated"
+
+ for e in out_edges:
+ e_len = e.attr["length"]
+ w = e.out_node
+ w.out_edges.sort( key=lambda x: x.attr["length"] )
+ if len(w.out_edges) > 0:
+ x = w.out_edges[0].out_node
+ if n_mark[x.name] == "inplay":
+ n_mark[x.name] = "eliminated"
+ for e2 in w.out_edges:
+ if e2.attr["length"] < FUZZ:
+ x = e2.out_node
+ if n_mark[x.name] == "inplay":
+ n_mark[x.name] = "eliminated"
+
+ for out_edge in out_edges:
+ v = out_edge.in_node
+ w = out_edge.out_node
+ if n_mark[w.name] == "eliminated":
+ e_reduce[ (v.name, w.name) ] = True
+ v_name, w_name = reverse_end(w.name), reverse_end(v.name)
+ e_reduce[(v_name, w_name)] = True
+ n_mark[w.name] = "vacant"
+
+
+ def mark_best_overlap(self):
+ """
+        keep the best-scoring out-edge and in-edge for each node; mark all other active edges for removal
+ """
+
+ best_edges = set()
+ removed_edges = set()
+
+ for v in self.nodes:
+
+ out_edges = self.nodes[v].out_edges
+ if len(out_edges) > 0:
+ out_edges.sort(key=lambda e: -e.attr["score"])
+ for e in out_edges:
+ if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+ best_edges.add( (e.in_node.name, e.out_node.name) )
+ break
+
+ in_edges = self.nodes[v].in_edges
+ if len(in_edges) > 0:
+ in_edges.sort(key=lambda e: -e.attr["score"])
+ for e in in_edges:
+ if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+ best_edges.add( (e.in_node.name, e.out_node.name) )
+ break
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "X", len(best_edges)
+
+ for e_n, e in self.edges.items():
+ v = e_n[0]
+ w = e_n[1]
+ if self.e_reduce[ (v, w) ] != True:
+ if (v, w) not in best_edges:
+ self.e_reduce[(v, w)] = True
+ removed_edges.add( (v, w) )
+ v2, w2 = reverse_end(w), reverse_end(v)
+ self.e_reduce[(v2, w2)] = True
+ removed_edges.add( (v2, w2) )
+
+ return removed_edges
+
+ def resolve_repeat_edges(self):
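+        """
+        Local flow constraint repeat resolution (used with --lfc): for each node
+        with exactly one active in-edge and one active out-edge, examine the
+        alternative branches leaving its predecessor (and entering its successor)
+        and reduce branches that share no downstream (upstream) flow with the
+        node itself while still having other support.
+        """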
+
+
+ edges_to_reduce = []
+ nodes_to_test = set()
+ for v_n, v in self.nodes.items():
+
+ out_nodes = []
+ for e in v.out_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ out_nodes.append( e.out_node.name )
+
+ in_nodes = []
+ for e in v.in_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ in_nodes.append( e.in_node.name )
+
+ if len(out_nodes) == 1 and len(in_nodes) == 1:
+ nodes_to_test.add(v_n)
+
+ for v_n in list( nodes_to_test ):
+
+ v = self.nodes[v_n]
+
+ out_nodes = []
+ for e in v.out_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ out_nodes.append( e.out_node.name )
+
+ in_nodes = []
+ for e in v.in_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ in_nodes.append( e.in_node.name )
+
+ in_node_name = in_nodes[0]
+
+ for out_edge in self.nodes[in_node_name].out_edges:
+ vv = out_edge.in_node.name
+ ww = out_edge.out_node.name
+
+ ww_out = self.nodes[ww].out_edges
+ v_out = self.nodes[v_n].out_edges
+ ww_out_nodes = set( [ n.out_node.name for n in ww_out] )
+ v_out_nodes = set( [ n.out_node.name for n in v_out] )
+ o_overlap = len( ww_out_nodes & v_out_nodes )
+
+ ww_in_count = 0
+ for e in self.nodes[ww].in_edges:
+ if self.e_reduce[ ( e.in_node.name, e.out_node.name ) ] == False:
+ ww_in_count += 1
+
+ if ww != v_n and\
+ self.e_reduce[ (vv, ww) ] == False and\
+ ww_in_count > 1 and\
+ ww not in nodes_to_test and\
+ o_overlap == 0:
+ edges_to_reduce.append( (vv, ww) )
+
+ out_node_name = out_nodes[0]
+
+ for in_edge in self.nodes[out_node_name].in_edges:
+ vv = in_edge.in_node.name
+ ww = in_edge.out_node.name
+
+ vv_in = self.nodes[vv].in_edges
+ v_in = self.nodes[v_n].in_edges
+ vv_in_nodes = set( [ n.in_node.name for n in vv_in] )
+ v_in_nodes = set( [ n.in_node.name for n in v_in] )
+ i_overlap = len( vv_in_nodes & v_in_nodes )
+
+ vv_out_count = 0
+ for e in self.nodes[vv].out_edges:
+ if self.e_reduce[ ( e.in_node.name, e.out_node.name )] == False:
+ vv_out_count += 1
+
+ if vv != v_n and\
+ self.e_reduce[ (vv, ww) ] == False and\
+ vv_out_count > 1 and\
+ vv not in nodes_to_test and\
+ i_overlap == 0:
+ edges_to_reduce.append( (vv, ww) )
+
+ removed_edges = set()
+ for e in edges_to_reduce:
+ self.e_reduce[e] = True
+ removed_edges.add(e)
+
+ return removed_edges
+
+ def get_out_edges_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].out_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ return rtn
+
+
+ def get_in_edges_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].in_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ return rtn
+
+ def get_best_out_edge_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].out_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ rtn.sort(key=lambda e: e.attr["score"])
+
+ return rtn[-1]
+
+ def get_best_in_edge_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].in_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ rtn.sort(key=lambda e: e.attr["score"])
+ return rtn[-1]
+
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def generate_seq_from_path(sg, seqs, path):
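+    """
+    Concatenate the read segments recorded on the edge labels along `path`,
+    reverse-complementing a segment whenever its begin coordinate is greater
+    than its end coordinate.
+    """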
+ subseqs = []
+ r_id, end = path[0].split(":")
+
+ count = 0
+ for i in range( len( path ) -1 ):
+ w_n, v_n = path[i:i+2]
+ edge = sg.edges[ (w_n, v_n ) ]
+ read_id, coor = edge.attr["label"].split(":")
+ b,e = coor.split("-")
+ b = int(b)
+ e = int(e)
+ if b < e:
+ subseqs.append( seqs[read_id][b:e] )
+ else:
+ subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
+
+ return "".join(subseqs)
+
+
+def reverse_edge( e ):
+ e1, e2 = e
+ return reverse_end(e2), reverse_end(e1)
+
+def reverse_path( p ):
+ p = p[::-1]
+ return [reverse_end(n) for n in p]
+
+
+def find_bundle(ug, u_edge_data, start_node, depth_cutoff, width_cutoff, length_cutoff):
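+    """
+    Starting from start_node, greedily grow a set of unitig edges (a "bundle")
+    until the tip set collapses to a single end node, giving up when the number
+    of tips, the depth, the width, or the accumulated length exceeds the given
+    cutoffs, or when a loop is detected.  Returns a convergence flag plus the
+    bundle data (start, end, edges, length, score, depth).
+    """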
+
+ tips = set()
+ bundle_edges = set()
+ bundle_nodes = set()
+
+ local_graph = nx.ego_graph(ug, start_node, depth_cutoff, undirected=False)
+ length_to_node = {start_node:0}
+ score_to_node = {start_node:0}
+
+ v = start_node
+ end_node = start_node
+
+ if DEBUG_LOG_LEVEL > 1:
+ print
+ print
+ print "start", start_node
+
+ bundle_nodes.add(v)
+ for vv, ww, kk in local_graph.out_edges(v, keys = True):
+ max_score = 0
+ max_length = 0
+
+ if (vv, ww, kk) not in bundle_edges and\
+ reverse_end(ww) not in bundle_nodes:
+
+ bundle_edges.add( (vv, ww, kk) )
+ tips.add(ww)
+
+ for v in list(tips):
+ bundle_nodes.add(v)
+
+ depth = 1
+ width = 1.0
+ converage = False
+
+
+ while 1:
+ if DEBUG_LOG_LEVEL > 1:
+ print "# of tips", len(tips)
+
+ if len(tips) > 4:
+ converage = False
+ break
+
+ if len(tips) == 1:
+ end_node = tips.pop()
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "end", end_node
+
+ if end_node not in length_to_node:
+ v = end_node
+ max_score_edge = None
+ max_score = 0
+ for uu, vv, kk in local_graph.in_edges(v, keys=True):
+ if uu not in length_to_node:
+ continue
+
+ score = u_edge_data[ (uu, vv, kk) ][1]
+
+ if score > max_score:
+
+ max_score = score
+ max_score_edge = (uu, vv, kk)
+
+ length_to_node[v] = length_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][0]
+ score_to_node[v] = score_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][1]
+
+
+ converage = True
+ break
+
+
+ depth += 1
+ width = 1.0 * len(bundle_edges) / depth
+
+ if depth > 10 and width > width_cutoff:
+ converage = False
+ break
+
+ if depth > depth_cutoff:
+ converage = False
+ break
+
+ tips_list = list(tips)
+
+ tip_updated = False
+ loop_detect = False
+ length_limit_reached = False
+
+ for v in tips_list:
+ if DEBUG_LOG_LEVEL > 1:
+ print "process", v
+
+ if len(local_graph.out_edges(v, keys=True)) == 0: # dead end route
+ print "no out edge", v
+ continue
+
+ max_score_edge = None
+ max_score = 0
+
+ extend_tip = True
+
+ for uu, vv, kk in local_graph.in_edges(v, keys=True):
+ if DEBUG_LOG_LEVEL > 1:
+ print "in_edges", uu, vv, kk
+ print uu, "in length_to_node", uu in length_to_node
+
+ if uu not in length_to_node:
+ extend_tip = False
+ break
+
+ score = u_edge_data[ (uu, vv, kk) ][1]
+
+ if score > max_score:
+
+ max_score = score
+ max_score_edge = (uu, vv, kk)
+
+ if extend_tip:
+
+ length_to_node[v] = length_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][0]
+ score_to_node[v] = score_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][1]
+
+ if length_to_node[v] > length_cutoff:
+ length_limit_reached = True
+ converage = False
+ break
+
+ v_updated = False
+ for vv, ww, kk in local_graph.out_edges(v, keys=True):
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "test", vv, ww, kk
+
+ if ww in length_to_node:
+ loop_detect = True
+ if DEBUG_LOG_LEVEL > 1:
+ print "loop_detect", ww
+ break
+
+ if (vv, ww, kk) not in bundle_edges and\
+ reverse_end(ww) not in bundle_nodes:
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "add", ww
+
+ tips.add(ww)
+ bundle_edges.add( (vv, ww, kk) )
+ tip_updated = True
+ v_updated = True
+
+ if v_updated:
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "remove", v
+
+ tips.remove(v)
+
+ if len(tips) == 1:
+ break
+
+ if loop_detect:
+ converage = False
+ break
+
+ if length_limit_reached:
+ converage = False
+ break
+
+ if loop_detect:
+ converage = False
+ break
+
+ if not tip_updated:
+ converage = False
+ break
+
+ for v in list(tips):
+ bundle_nodes.add(v)
+
+
+
+ data = start_node, end_node, bundle_edges, length_to_node[end_node], score_to_node[end_node], depth
+
+ data_r = None
+
+ if DEBUG_LOG_LEVEL > 1:
+ print converage, data, data_r
+ return converage, data, data_r
+
+def generate_string_graph(args):
+
+ overlap_file = args.overlap_file
+
+ contained_reads = set()
+ chimer_ids = set()
+
+ filter_reads = False
+
+ seqs = set()
+
+ G=nx.Graph()
+ edges =set()
+ overlap_data = []
+ contained_reads = set()
+ overlap_count = {}
+
+
+    # loop through the overlap data, loading the records into a python list
+    # and identifying contained reads along the way
+
+ with open(overlap_file) as f:
+ for l in f:
+ l = l.strip().split()
+
+
+            #work around for some ill-formed data records
+ #if len(l) != 13:
+ # continue
+
+ f_id, g_id, score, identity = l[:4]
+
+ if f_id == g_id: # don't need self-self overlapping
+ continue
+
+ if filter_reads:
+
+ if g_id not in seqs:
+ continue
+
+ if f_id not in seqs:
+ continue
+
+ phase1 = l[13]
+ phase2 = l[14]
+
+ score = int(score)
+ identity = float(identity)
+ contained = l[12]
+ if contained == "contained" and phase1 == phase2:
+ contained_reads.add(f_id)
+ continue
+ if contained == "contains" and phase1 == phase2:
+ contained_reads.add(g_id)
+ continue
+ if contained == "none":
+ continue
+
+            if identity < args.min_idt: # only keep records whose identity is at least min_idt (default 96%)
+ continue
+ f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
+ g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
+
+            # only use reads longer than min_len (default 4 kb) for assembly
+ if f_len < args.min_len: continue
+ if g_len < args.min_len: continue
+
+ """
+ # double check for proper overlap
+ # this is not necessary when using DALIGNER for overlapper
+ # it may be useful if other overlappers give fuzzier alignment boundary
+ if f_start > 24 and f_len - f_end > 24: # allow 24 base tolerance on both sides of the overlapping
+ continue
+
+ if g_start > 24 and g_len - g_end > 24:
+ continue
+
+ if g_strain == 0:
+ if f_start < 24 and g_len - g_end > 24:
+ continue
+ if g_start < 24 and f_len - f_end > 24:
+ continue
+ else:
+ if f_start < 24 and g_start > 24:
+ continue
+ if g_start < 24 and f_start > 24:
+ continue
+ """
+
+ overlap_data.append( (f_id, g_id, score, identity,
+ f_strain, f_start, f_end, f_len,
+ g_strain, g_start, g_end, g_len,
+ phase1, phase2) )
+
+ overlap_count[f_id] = overlap_count.get(f_id,0)+1
+ overlap_count[g_id] = overlap_count.get(g_id,0)+1
+
+ overlap_set = set()
+ sg = StringGraph()
+ for od in overlap_data:
+ f_id, g_id, score, identity = od[:4]
+ if f_id in contained_reads:
+ continue
+ if g_id in contained_reads:
+ continue
+ f_s, f_b, f_e, f_l = od[4:8]
+ g_s, g_b, g_e, g_l = od[8:12]
+
+ phase1 = od[12]
+ phase2 = od[13]
+
+ overlap_pair = [f_id, g_id]
+ overlap_pair.sort()
+ overlap_pair = tuple( overlap_pair )
+ if overlap_pair in overlap_set: # don't allow duplicated records
+ continue
+ else:
+ overlap_set.add(overlap_pair)
+
+
+        if g_s == 1: # reversed alignment, swap the begin and end coordinates
+ g_b, g_e = g_e, g_b
+
+ # build the string graph edges for each overlap
+ if f_b > 1:
+ if g_b < g_e:
+ """
+ f.B f.E
+ f ----------->
+ g ------------->
+ g.B g.E
+ """
+ if f_b == 0 or g_e - g_l == 0:
+ continue
+ sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
+ length = abs(f_b-0),
+ score = -score,
+ identity = identity,
+ phase = (phase2, phase1))
+ sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = (g_id, g_e, g_l),
+ length = abs(g_e-g_l),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ else:
+ """
+ f.B f.E
+ f ----------->
+ g <-------------
+ g.E g.B
+ """
+ if f_b == 0 or g_e == 0:
+ continue
+ sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
+ length = abs(f_b -0),
+ score = -score,
+ identity = identity,
+ phase = (phase2, phase1))
+ sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = (g_id, g_e, 0),
+ length = abs(g_e- 0),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ else:
+ if g_b < g_e:
+ """
+ f.B f.E
+ f ----------->
+ g ------------->
+ g.B g.E
+ """
+ if g_b == 0 or f_e - f_l == 0:
+ continue
+ sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = (g_id, g_b, 0),
+ length = abs(g_b - 0),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
+ length = abs(f_e-f_l),
+ score = -score,
+ identity = identity,
+ phase = (phase2, phase1))
+ else:
+ """
+ f.B f.E
+ f ----------->
+ g <-------------
+ g.E g.B
+ """
+ if g_b - g_l == 0 or f_e - f_l ==0:
+ continue
+ sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = (g_id, g_b, g_l),
+ length = abs(g_b - g_l),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
+ length = abs(f_e - f_l),
+ score = -score,
+ identity = identity,
+ phase=(phase2, phase1))
+
+
+ cp_edges = sg.remove_cross_phase_edges()
+ sg.init_reduce_dict()
+
+ #if not args.disable_chimer_prediction:
+ # sg.mark_chimer_edges()
+ #sg.mark_spur_edge()
+
+
+    sg.mark_tr_edges() # mark transitively redundant edges
+
+ if DEBUG_LOG_LEVEL > 1:
+ print sum( [1 for c in sg.e_reduce.values() if c == True] )
+ print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+ chimer_nodes, chimer_edges = sg.mark_chimer_edges()
+
+ removed_edges = set()
+ if args.lfc == True:
+ removed_edges = sg.resolve_repeat_edges()
+ else:
+        removed_edges = sg.mark_best_overlap() # keep only the best overlap edges, mark the rest for removal
+
+ spur_edges = sg.mark_spur_edge()
+
+ if DEBUG_LOG_LEVEL > 1:
+ print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+ #max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
+ out_f = open("sg_edges_list", "w")
+ nxsg = nx.DiGraph()
+ edge_data = {}
+ for v, w in sg.edges:
+ e = sg.edges[ (v, w) ]
+ rid, sp, tp = e.attr["label"]
+ score = e.attr["score"]
+ identity = e.attr["identity"]
+ length = abs(sp-tp)
+
+
+ if sg.e_reduce[(v, w)] != True:
+ type_ = "G"
+ label = "%s:%d-%d" % (rid, sp, tp)
+ nxsg.add_edge(v, w, label = label, length = length, score = score)
+ edge_data[ (v, w) ] = (rid, sp, tp, length, score, identity, type_)
+ elif (v, w) in chimer_edges:
+ type_ = "C"
+ elif (v, w) in removed_edges:
+ type_ = "R"
+ elif (v, w) in spur_edges:
+ type_ = "S"
+ elif sg.e_reduce[(v, w)] == True:
+ type_ = "TR"
+
+ print >>out_f, v, w, rid, sp, tp, score, identity, type_
+
+ for k,e in cp_edges.items():
+ v = k[0]
+ w = k[1]
+ rid, sp, tp = e.attr["label"]
+ score = e.attr["score"]
+ identity = e.attr["identity"]
+ length = abs(sp-tp)
+ print >>out_f, v, w, rid, sp, tp, score, identity, "CP"
+
+
+ out_f.close()
+ nxsg_r = nxsg.reverse()
+
+ return nxsg, nxsg_r, edge_data
+
+
+
+def construct_compound_paths(ug, u_edge_data):
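+    """
+    From every branching node, try to collapse the local subgraph into a
+    converging bundle using find_bundle(); keep only bundles that do not share
+    edges with previously accepted bundles, that have a matching
+    reverse-complement bundle, and that are not contained in another bundle.
+    The surviving bundles become "compound" unitig edges.
+    """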
+
+ source_nodes = set()
+ sink_nodes = set()
+ simple_nodes = set()
+ branch_nodes = set()
+
+ all_nodes = ug.nodes()
+ for n in all_nodes:
+ in_degree = len( ug.in_edges(n) )
+ out_degree = len( ug.out_edges(n) )
+ if in_degree == 0:
+ source_nodes.add(n)
+ if out_degree == 0:
+ sink_nodes.add(n)
+ if in_degree == 1 and out_degree == 1:
+ simple_nodes.add(n)
+ if in_degree > 1 or out_degree > 1:
+ branch_nodes.add(n)
+
+ #print "#", len(all_nodes),len(source_nodes), len(sink_nodes), len(simple_nodes), len(branch_nodes)
+ compound_paths_0 = []
+ for p in list(branch_nodes):
+ if ug.out_degree(p) > 1:
+ coverage, data, data_r = find_bundle(ug, u_edge_data, p, 48, 16, 500000)
+ if coverage == True:
+ start_node, end_node, bundle_edges, length, score, depth = data
+ compound_paths_0.append( (start_node, "NA", end_node, 1.0*len(bundle_edges)/depth, length, score, bundle_edges ) )
+
+ compound_paths_0.sort( key=lambda x: -len(x[6]) )
+
+
+ edge_to_cpath = {}
+ compound_paths_1 = {}
+ for s, v, t, width, length, score, bundle_edges in compound_paths_0:
+ if DEBUG_LOG_LEVEL > 1:
+ print "constructing utg, test ", s,v, t
+
+ overlapped = False
+ for vv, ww, kk in list(bundle_edges):
+ if (vv, ww, kk) in edge_to_cpath:
+ if DEBUG_LOG_LEVEL > 1:
+ print "remove overlapped utg", (s, v, t), (vv, ww, kk)
+ overlapped = True
+ break
+ rvv = reverse_end(vv)
+ rww = reverse_end(ww)
+ rkk = reverse_end(kk)
+ if (rww, rvv, rkk) in edge_to_cpath:
+ if DEBUG_LOG_LEVEL > 1:
+ print "remove overlapped r utg", (s, v, t), (rww, rvv, rkk)
+ overlapped = True
+ break
+
+
+ if not overlapped:
+ if DEBUG_LOG_LEVEL > 1:
+ print "constructing", s,v, t
+
+ bundle_edges_r = []
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+
+ for vv, ww, kk in list(bundle_edges):
+ edge_to_cpath.setdefault( (vv, ww, kk), set() )
+ edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+ rvv = reverse_end(ww)
+ rww = reverse_end(vv)
+ rkk = reverse_end(kk)
+ edge_to_cpath.setdefault( (rvv, rww, rkk), set() )
+ edge_to_cpath[ (rvv, rww, rkk) ].add( (rs, rt, v) ) #assert v == "NA"
+ bundle_edges_r.append( (rvv, rww, rkk) )
+
+ compound_paths_1[ ( s, v, t) ] = width, length, score, bundle_edges
+ compound_paths_1[ ( rs, v, rt) ] = width, length, score, bundle_edges_r
+
+
+ compound_paths_2 = {}
+ edge_to_cpath = {}
+ for s, v, t in compound_paths_1:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ if (rs, "NA", rt) not in compound_paths_1:
+ if DEBUG_LOG_LEVEL > 1:
+ print "non_compliment bundle", s, v, t, len(compound_paths_1[( s, v, t)][-1])
+ continue
+ width, length, score, bundle_edges = compound_paths_1[ (s, v, t) ]
+ compound_paths_2[ (s, v, t) ] = width, length, score, bundle_edges
+ for vv, ww, kk in list(bundle_edges):
+ edge_to_cpath.setdefault( (vv, ww, kk), set() )
+ edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+
+
+ compound_paths_3 = {}
+ for k, val in compound_paths_2.items():
+
+ start_node, NA, end_node = k
+ rs = reverse_end(end_node)
+ rt = reverse_end(start_node)
+ assert (rs, "NA", rt) in compound_paths_2
+
+ contained = False
+ for vv, ww, kk in ug.out_edges(start_node, keys=True):
+ if len(edge_to_cpath.get( (vv, ww, kk), [] )) > 1:
+ contained = True
+
+ if not contained:
+ compound_paths_3[k] = val
+ if DEBUG_LOG_LEVEL > 1:
+ print "compound", k
+
+ compound_paths = {}
+ for s, v, t in compound_paths_3:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ if (rs, "NA", rt) not in compound_paths_3:
+ continue
+ compound_paths[ (s, v, t) ] = compound_paths_3[ (s, v, t) ]
+
+ return compound_paths
+
+def parse_args(argv):
+
+    parser = argparse.ArgumentParser(description='a string graph assembler designed for handling diploid genomes')
+ parser.add_argument('overlap_file', help='a file that contains the overlap information.')
+ parser.add_argument('--min_len', type=int, default=4000,
+ help='minimum length of the reads to be considered for assembling')
+ parser.add_argument('--min_idt', type=float, default=96,
+ help='minimum alignment identity of the reads to be considered for assembling')
+ parser.add_argument('--lfc', action="store_true", default=False,
+ help='use local flow constraint method rather than best overlap method to resolve knots in string graph')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+
+ # transitivity reduction, remove spurs, remove putative edges caused by repeats
+ sg, sg_r, edge_data = generate_string_graph(args)
+
+
+ simple_paths = {}
+ dual_path = {}
+
+
+ sg2 = nx.DiGraph()
+
+ for v, w in edge_data:
+
+ assert (reverse_end(w), reverse_end(v)) in edge_data
+
+ #if (v, w) in masked_edges:
+ # continue
+
+ rid, sp, tp, length, score, identity, type_ = edge_data[ (v, w) ]
+ if type_ != "G":
+ continue
+
+ label = "%s:%d-%d" % (rid, sp, tp)
+ sg2.add_edge( v, w, label = label, length = length, score = score)
+
+
+ # utg construction phase 1, identify all simple paths
+ s_nodes = set()
+ t_nodes = set()
+ simple_nodes = set()
+
+ all_nodes = sg2.nodes()
+ for n in all_nodes:
+ in_degree = len( sg2.in_edges(n) )
+ out_degree = len( sg2.out_edges(n) )
+ if in_degree == 1 and out_degree == 1:
+ simple_nodes.add(n)
+ else:
+ if out_degree != 0:
+ s_nodes.add(n)
+ if in_degree != 0:
+ t_nodes.add(n)
+
+
+ free_edges = set(sg2.edges())
+
+ if DEBUG_LOG_LEVEL > 1:
+ for s in list(simple_nodes):
+ print "simple_node", s
+ for s in list(s_nodes):
+ print "s_node", s
+ for s in list(t_nodes):
+ print "t_node", s
+
+ for v,w in free_edges:
+ if (reverse_end(w), reverse_end(v) ) not in free_edges:
+ print "bug", v,w
+            print reverse_end(w), reverse_end(v)
+
+ while len(free_edges) != 0:
+ if len(s_nodes) != 0:
+ n = s_nodes.pop()
+ if DEBUG_LOG_LEVEL > 1:
+ print "initial utg 1", n
+ else:
+ e = free_edges.pop()
+ free_edges.add(e)
+ n = e[0]
+ if DEBUG_LOG_LEVEL > 1:
+ print "initial utg 2", n
+
+ path = []
+ path_length =0
+ path_score = 0
+ for v, w in sg2.out_edges(n):
+ if (v, w) not in free_edges:
+ continue
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+
+ path_length = 0
+ path_score = 0
+ v0 = v
+ w0 = w
+ path = [v, w]
+ path_edges = set()
+ path_edges.add( (v, w) )
+ path_length += edge_data[ (v, w) ][3]
+ path_score += edge_data[ (v, w) ][4]
+ free_edges.remove( (v, w) )
+
+ r_path_length = 0
+ r_path_score = 0
+ rv0 = rv
+ rw0 = rw
+ r_path = [rv, rw] # need to reverse again
+ r_path_edges = set()
+ r_path_edges.add( (rw, rv) )
+ r_path_length += edge_data[ (rw, rv) ][3]
+ r_path_score += edge_data[ (rw, rv) ][4]
+ free_edges.remove( (rw, rv) )
+
+ while w in simple_nodes:
+ w, w_ = sg2.out_edges(w)[0]
+ if (w, w_) not in free_edges:
+ break
+ rw_, rw = reverse_end(w_), reverse_end(w)
+
+ if ( rw_, rw ) in path_edges:
+ break
+
+ path.append(w_)
+ path_edges.add( (w, w_) )
+ path_length += edge_data[ (w, w_) ][3]
+ path_score += edge_data[ (w, w_) ][4]
+ free_edges.remove( (w, w_) )
+
+ r_path.append(rw_)
+ r_path_edges.add( (rw_, rw) )
+ r_path_length += edge_data[ (rw_, rw) ][3]
+ r_path_score += edge_data[ (rw_, rw) ][4]
+ free_edges.remove( (rw_, rw) )
+
+
+ w = w_
+
+ simple_paths[ (v0, w0, path[-1]) ] = path_length, path_score, path
+ r_path.reverse()
+ assert r_path[0] == reverse_end(path[-1])
+ simple_paths[ (r_path[0], rw0, rv0) ] = r_path_length, r_path_score, r_path
+
+ if DEBUG_LOG_LEVEL > 1:
+ print path_length, path_score, path
+
+ dual_path[ (r_path[0], rw0, rv0) ] = (v0, w0, path[-1])
+ dual_path[ (v0, w0, path[-1]) ] = (r_path[0], rw0, rv0)
+
+
+
+ ug = nx.MultiDiGraph()
+ u_edge_data = {}
+ circular_path = set()
+
+ for s, v, t in simple_paths:
+ length, score, path = simple_paths[ (s, v, t) ]
+ u_edge_data[ (s, t, v) ] = (length, score, path, "simple")
+ if s != t:
+ ug.add_edge(s, t, key = v, type_ = "simple", via = v, length = length, score = score)
+ else:
+ circular_path.add( (s, t, v) )
+
+
+ if DEBUG_LOG_LEVEL > 1:
+ with open("utg_data0","w") as f:
+ for s, t, v in u_edge_data:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ rv = reverse_end(v)
+ assert (rs, rt, rv) in u_edge_data
+ length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+
+ if type_ == "compound":
+ path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+ else:
+ path_or_edges = "~".join( path_or_edges )
+ print >>f, s, v, t, type_, length, score, path_or_edges
+
+ # identify spurs in the utg graph
+    # Currently, we use ad-hoc logic to filter out shorter utgs, but we can
+ # add proper alignment comparison later to remove redundant utgs
+
+ utg_spurs = set()
+ all_nodes = ug.nodes()
+
+ ug2 = ug.copy()
+
+ s_candidates = set( )
+ for v in ug2.nodes():
+ if ug2.in_degree(v) == 0:
+ s_candidates.add(v)
+
+ while len(s_candidates) > 0:
+ n = s_candidates.pop()
+ if ug2.in_degree(n) != 0:
+ continue
+ n_ego_graph = nx.ego_graph( ug2, n, radius = 10 )
+ n_ego_node_set = set( n_ego_graph.nodes() )
+ for b_node in n_ego_graph.nodes():
+ if ug2.in_degree(b_node) <= 1:
+ continue
+
+ with_extern_node = False
+ b_in_nodes = [e[0] for e in ug2.in_edges(b_node)]
+
+ if len(b_in_nodes) == 1:
+ continue
+
+ for v in b_in_nodes:
+ if v not in n_ego_node_set:
+ with_extern_node = True
+ break
+
+ if not with_extern_node:
+ continue
+
+ s_path = nx.shortest_path( ug2, n, b_node )
+ v1 = s_path[0]
+ total_length = 0
+ for v2 in s_path[1:]:
+ for s, t, v in ug2.out_edges(v1, keys=True):
+ if t != v2:
+ continue
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ total_length += length
+ v1 = v2
+
+ if total_length >= 50000:
+ continue
+
+ v1 = s_path[0]
+ for v2 in s_path[1:]:
+ for s, t, v in ug2.out_edges(v1, keys=True):
+ if t != v2:
+ continue
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ rv = reverse_end(v)
+ try:
+ ug2.remove_edge( s, t, key= v)
+ ug2.remove_edge( rs, rt, key= rv)
+ u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
+ u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
+ except Exception:
+ pass
+
+                if ug2.in_degree(v2) == 0:
+ s_candidates.add(v2)
+ v1 = v2
+ break
+
+ #phase 2, finding all "consistent" compound paths
+ compound_paths = construct_compound_paths(ug2, u_edge_data)
+ compound_path_file = open("c_path","w")
+
+ ug2_edges = set(ug2.edges(keys = True))
+ edges_to_remove = set()
+ for s, v, t in compound_paths:
+ width, length, score, bundle_edges = compound_paths[ (s, v, t) ]
+ print >> compound_path_file, s,v,t, width, length, score, "|".join( [e[0]+"~"+e[2]+"~"+e[1] for e in bundle_edges] )
+ for ss, tt, vv in bundle_edges:
+ if (ss, tt, vv) in ug2_edges:
+ edges_to_remove.add( (ss, tt, vv) )
+
+
+ for s, t, v in edges_to_remove:
+ ug2.remove_edge( s, t ,v )
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ if type_ != "spur":
+ u_edge_data[ (s, t, v) ] = length, score, edges, "contained"
+
+
+ for s, v, t in compound_paths:
+ width, length, score, bundle_edges = compound_paths[ (s, v, t) ]
+ u_edge_data[ (s, t, v) ] = (length, score, bundle_edges, "compound")
+ ug2.add_edge( s, t, key = v, via = v, type_="compound", length = length, score = score)
+
+ assert v == "NA"
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ assert (rs, v, rt) in compound_paths
+ dual_path[ (s, v, t) ] = (rs, v, rt)
+ dual_path[ (rs, v, rt) ] = (s, v, t)
+
+ compound_path_file.close()
+
+
+ # remove short utg using local flow consistent rule
+ """
+    short UTGs like this can be removed; this kind of utg is likely an artifact of repeats
+ >____ _____>
+ \__UTG_>__/
+ <____/ \_____<
+ """
+ ug_edge_to_remove = set()
+ for s, t, v in ug.edges(keys=True):
+ if ug2.in_degree(s) == 1 and ug2.out_degree(s) == 2 and \
+ ug2.in_degree(t) == 2 and ug2.out_degree(t) == 1:
+ length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+ if length < 60000:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ rv = reverse_end(v)
+ ug_edge_to_remove.add( (s, t, v) )
+ ug_edge_to_remove.add( (rs, rt, rv) )
+ for s, t, v in list(ug_edge_to_remove):
+ ug2.remove_edge(s, t, key=v)
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ u_edge_data[ (s, t, v) ] = length, score, edges, "repeat_bridge"
+
+ ug = ug2
+
+ with open("utg_data","w") as f:
+ for s, t, v in u_edge_data:
+ length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+
+ if v == "NA":
+ path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+ else:
+ path_or_edges = "~".join( path_or_edges )
+ print >>f, s, v, t, type_, length, score, path_or_edges
+
+ # contig construction from utgs
+
+ s_nodes = set()
+ t_nodes = set()
+ simple_nodes = set()
+ simple_out = set()
+ simple_in = set()
+
+ all_nodes = ug.nodes()
+ for n in all_nodes:
+ in_degree = len( ug.in_edges(n) )
+ out_degree = len( ug.out_edges(n) )
+ if in_degree == 1 and out_degree == 1:
+ simple_nodes.add(n)
+ else:
+ if out_degree != 0:
+ s_nodes.add(n)
+ if in_degree != 0:
+ t_nodes.add(n)
+ if out_degree == 1:
+ simple_out.add(n)
+ if in_degree == 1:
+ simple_in.add(n)
+
+ all_nodes = set(all_nodes)
+ c_path = []
+
+ free_edges = set()
+ for s, t, v in ug.edges(keys=True):
+ free_edges.add( (s, t, v) )
+
+ while len(free_edges) != 0:
+
+ if len(s_nodes) != 0:
+ n = s_nodes.pop()
+ else:
+ e = free_edges.pop()
+ n = e[0]
+
+ for s, t, v in ug.out_edges(n, keys=True):
+ path_start = n
+ path_end = None
+ path_key = None
+ path = []
+ path_length = 0
+ path_score = 0
+ path_nodes = set()
+ path_nodes.add(s)
+ if DEBUG_LOG_LEVEL > 1:
+ print "check 1", s, t, v
+ path_key = t
+ t0 = s
+ while t in simple_out:
+ if t in path_nodes:
+ break
+ rt = reverse_end(t)
+ if rt in path_nodes:
+ break
+
+ path.append( (t0, t, v) )
+ path_nodes.add(t)
+ length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+ path_length += length
+ path_score += score
+ assert len( ug.out_edges( t, keys=True ) ) == 1 # t is "simple_out" node
+ t0, t, v = ug.out_edges( t, keys=True )[0]
+
+ path.append( (t0, t, v) )
+ length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+ path_length += length
+ path_score += score
+ path_nodes.add(t)
+ path_end = t
+
+ c_path.append( (path_start, path_key, path_end, path_length, path_score, path, len(path)) )
+ if DEBUG_LOG_LEVEL > 1:
+ print "c_path", path_start, path_key, path_end, path_length, path_score, len(path)
+ for e in path:
+ if e in free_edges:
+ free_edges.remove( e )
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "left over edges:", len(free_edges)
+
+
+
+ free_edges = set()
+ for s, t, v in ug.edges(keys=True):
+ free_edges.add( (s, t, v) )
+
+
+ ctg_id = 0
+
+ ctg_paths = open("ctg_paths","w")
+
+ c_path.sort( key=lambda x: -x[3] )
+
+
+ for path_start, path_key, path_end, p_len, p_score, path, n_edges in c_path:
+ length = 0
+ score = 0
+ length_r = 0
+ score_r = 0
+
+ non_overlapped_path = []
+ non_overlapped_path_r = []
+ for s, t, v in path:
+ if v != "NA":
+ rs, rt, rv = reverse_end(t), reverse_end(s), reverse_end(v)
+ else:
+ rs, rt, rv = reverse_end(t), reverse_end(s), "NA"
+ if (s, t, v) in free_edges and (rs, rt, rv) in free_edges:
+ non_overlapped_path.append( (s,t,v) )
+ non_overlapped_path_r.append( (rs, rt, rv) )
+ length += u_edge_data[ (s, t, v) ][0]
+ score += u_edge_data[ (s, t, v) ][1]
+ length_r += u_edge_data[ (rs, rt, rv) ][0]
+ score_r += u_edge_data[ (rs, rt, rv) ][1]
+ else:
+ break
+
+ if len(non_overlapped_path) == 0:
+ continue
+ s0, t0, v0 = non_overlapped_path[0]
+ end_node = non_overlapped_path[-1][1]
+
+ print >> ctg_paths, "%06dF" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length, score, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path ] )
+ non_overlapped_path_r.reverse()
+ s0, t0, v0 = non_overlapped_path_r[0]
+ end_node = non_overlapped_path_r[-1][1]
+ print >> ctg_paths, "%06dR" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length_r, score_r, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path_r ] )
+ ctg_id += 1
+ for e in non_overlapped_path:
+ if e in free_edges:
+ free_edges.remove(e)
+ for e in non_overlapped_path_r:
+ if e in free_edges:
+ free_edges.remove(e)
+
+
+
+ for s, t, v in list(circular_path):
+ length, score, path, type_ = u_edge_data[ (s, t, v) ]
+ print >> ctg_paths, "%6d" % ctg_id, "ctg_circular", s+"~"+v+"~"+t, t, length, score, s+"~"+v+"~"+t
+ ctg_id += 1
+
+ ctg_paths.close()
diff --git a/FALCON_unzip/falcon_unzip/phasing.py b/FALCON_unzip/falcon_unzip/phasing.py
new file mode 100644
index 0000000..5592869
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/phasing.py
@@ -0,0 +1,569 @@
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+ makePypeLocalFile, fn, PypeTask)
+from falcon_kit.FastaReader import FastaReader
+import argparse
+import logging
+import os
+import re
+import shlex
+import subprocess
+import sys
+
+cigar_re = r"(\d+)([MIDNSHP=X])"
+
+def make_het_call(self):
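+    """
+    Stream `samtools view` output for one contig, build a per-position base
+    pileup from the aligned (M) portions of each read, and write candidate
+    heterozygous positions (coverage >= 10 where the second most frequent base
+    is supported by more than 25% of the reads) to the variant_map and
+    variant_pos files, plus a mapping from internal query ids to read names.
+    """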
+
+ bam_fn = fn(self.bam_file)
+ ctg_id = self.parameters["ctg_id"]
+ ref_seq = self.parameters["ref_seq"]
+ base_dir = self.parameters["base_dir"]
+ vmap_fn = fn(self.vmap_file)
+ vpos_fn = fn(self.vpos_file)
+ q_id_map_fn = fn(self.q_id_map_file)
+
+ p = subprocess.Popen(shlex.split("samtools view %s %s" % (bam_fn, ctg_id) ), stdout=subprocess.PIPE)
+ pileup = {}
+ q_id_map = {}
+ q_max_id = 0
+ q_id = 0
+ q_name_to_id = {}
+
+ try:
+ os.makedirs("%s/%s" % (base_dir, ctg_id))
+ except OSError:
+ pass
+
+ vmap = open(vmap_fn, "w")
+ vpos = open(vpos_fn, "w")
+
+ for l in p.stdout:
+ l = l.strip().split()
+ if l[0][0] == "@":
+ continue
+
+ QNAME = l[0]
+ if QNAME not in q_name_to_id:
+ q_id = q_max_id
+ q_name_to_id[QNAME] = q_id
+ q_max_id += 1
+
+ q_id = q_name_to_id[QNAME]
+ q_id_map[q_id] = QNAME
+ FLAG = int(l[1])
+ RNAME = l[2]
+ POS = int(l[3]) - 1 # convert to zero base
+ CIGAR = l[5]
+ SEQ = l[9]
+ rp = POS
+ qp = 0
+
+ skip_base = 0
+ total_aln_pos = 0
+ for m in re.finditer(cigar_re, CIGAR):
+ adv = int(m.group(1))
+ total_aln_pos += adv
+
+ if m.group(2) == "S":
+ skip_base += adv
+
+ if 1.0 - 1.0 * skip_base / total_aln_pos < 0.1:
+ continue
+ if total_aln_pos < 2000:
+ continue
+
+ for m in re.finditer(cigar_re, CIGAR):
+ adv = int(m.group(1))
+ if m.group(2) == "S":
+ qp += adv
+ if m.group(2) == "M":
+ matches = []
+ for i in range(adv):
+ matches.append( (rp, SEQ[qp]) )
+ rp += 1
+ qp += 1
+ matches = matches[1:-1]
+ for pos, b in matches:
+ pileup.setdefault(pos, {})
+ pileup[pos].setdefault(b, [])
+ pileup[pos][b].append(q_id)
+ elif m.group(2) == "I":
+ for i in range(adv):
+ qp += 1
+ elif m.group(2) == "D":
+ for i in range(adv):
+ rp += 1
+
+ pos_k = pileup.keys()
+ pos_k.sort()
+ th = 0.25
+ for pos in pos_k:
+ if pos < POS:
+ if len(pileup[pos]) < 2:
+ del pileup[pos]
+ continue
+ base_count = []
+ total_count = 0
+ for b in ["A", "C", "G", "T"]:
+ count = len(pileup[pos].get(b,[]))
+ base_count.append( (count, b) )
+ total_count += count
+ if total_count < 10:
+ del pileup[pos]
+ continue
+
+ base_count.sort()
+ base_count.reverse()
+ p0 = 1.0 * base_count[0][0] / total_count
+ p1 = 1.0 * base_count[1][0] / total_count
+ if p0 < 1.0 - th and p1 > th:
+ b0 = base_count[0][1]
+ b1 = base_count[1][1]
+ ref_base = ref_seq[pos]
+ print >> vpos, pos+1, ref_base, total_count, " ".join(["%s %d" % (x[1], x[0]) for x in base_count])
+ for q_id_ in pileup[pos][b0]:
+ print >> vmap, pos+1, ref_base, b0, q_id_
+ for q_id_ in pileup[pos][b1]:
+ print >> vmap, pos+1, ref_base, b1, q_id_
+ del pileup[pos]
+
+
+ q_id_map_f = open(q_id_map_fn, "w")
+ for q_id, q_name in q_id_map.items():
+ print >> q_id_map_f, q_id, q_name
+
+
+def generate_association_table(self):
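+    """
+    For every pair of nearby variant positions (within 2^16 bases, at most 500
+    pairs per left position, and with at least 6 shared reads), count how many
+    reads support each of the four allele combinations and write the 2x2
+    contingency counts to the association table.
+    """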
+
+ vmap_fn = fn(self.vmap_file)
+ atable_fn = fn(self.atable_file)
+ ctg_id = self.parameters["ctg_id"]
+ base_dir = self.parameters["base_dir"]
+
+ vmap = {}
+ v_positions = []
+
+ with open(vmap_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ pos = int(l[0])
+ ref_b = l[1]
+ v_b = l[2]
+ q_id = int(l[3])
+ if (pos, ref_b) not in vmap:
+ v_positions.append( (pos, ref_b) )
+ vmap.setdefault( (pos, ref_b), {} )
+ vmap[ (pos, ref_b) ].setdefault(v_b, [])
+ vmap[ (pos, ref_b) ][v_b].append( q_id )
+
+
+ #xary = []
+ #yary = []
+ with open(atable_fn, "w") as out_f:
+ for i1 in xrange(len(v_positions)):
+ link_count = 0
+ for i2 in xrange(i1+1, len(v_positions)):
+ pos1, rb1 = v_positions[i1]
+ pos2, rb2 = v_positions[i2]
+ if pos2 - pos1 > (1 << 16):
+ continue
+ ct = {}
+ p1table = []
+ p2table = []
+ s1 = 0
+ list1 = vmap[ (pos1, rb1) ].items()
+ for b1, qids1 in list1:
+ p1table.append( (b1, len(qids1) ) )
+ s1 += len(qids1)
+
+ s2 = 0
+ list2 = vmap[ (pos2, rb2) ].items()
+ for b2, qids2 in list2:
+ p2table.append( (b2, len(qids2) ) )
+ s2 += len(qids2)
+
+ total_s = 0
+ for b1, qids1 in list1:
+ for b2, qids2 in list2:
+ s = len(set(qids1) & set(qids2))
+ ct[(b1,b2)] = s
+ total_s += s
+ if total_s < 6:
+ continue
+
+ b11 = p1table[0][0]
+ b12 = p1table[1][0]
+ b21 = p2table[0][0]
+ b22 = p2table[1][0]
+ print >> out_f, pos1, b11, b12, pos2, b21, b22, ct[(b11,b21)], ct[(b11,b22)], ct[(b12,b21)], ct[(b12,b22)]
+
+
+ #xary.append(pos1)
+ #yary.append(pos2)
+ link_count += 1
+ if link_count > 500:
+ break
+
+def get_score( c_score, pos1, pos2, s1, s2 ):
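+    """
+    Look up the co-occurrence score for assigning allele states s1 and s2 to
+    positions pos1 and pos2; the arguments are swapped if necessary so the
+    lookup key is always (smaller position, larger position).
+    """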
+ if pos1 > pos2:
+ pos1, pos2 = pos2, pos1
+ s1, s2 = s2, s1
+ b11, b12 = s1
+ b21, b22 = s2
+ return c_score[ (pos1, pos2) ][ (b11+b21, b12+b22) ]
+
+def get_phased_blocks(self):
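+    """
+    Greedily assign each heterozygous position one of its two allele orderings
+    so that it agrees with already-assigned neighbours, refine the assignment
+    for up to 10 sweeps, then chain positions whose phasing is well supported
+    on both sides (score >= 10) into phase blocks of more than 3 variants and
+    write them out as "P"/"V" records.
+    """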
+ vmap_fn = fn(self.vmap_file)
+ atable_fn = fn(self.atable_file)
+ p_variant_fn = fn(self.phased_variant_file)
+
+ left_connect = {}
+ right_connect = {}
+
+ c_score = {}
+ states = {}
+ positions = set()
+
+
+
+ ref_base = {}
+ with open(vmap_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ pos = int(l[0])
+ ref_b = l[1]
+ v_b = l[2]
+ q_id = int(l[3])
+ ref_base[pos] = ref_b
+
+ with open(atable_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ pos1, b11, b12, pos2, b21, b22, s11, s12, s21, s22 = l
+ s11, s12, s21, s22 = int(s11), int(s12), int(s21), int(s22)
+ if abs(s11+s22-s12-s21) < 6:
+ continue
+ pos1 = int(pos1)
+ pos2 = int(pos2)
+ positions.add(pos1)
+ positions.add(pos2)
+ right_connect.setdefault(pos1, [])
+ right_connect[pos1].append(pos2)
+ left_connect.setdefault(pos2, [])
+ left_connect[pos2].append(pos1)
+ c_score[ (pos1, pos2) ] = { (b11+b21, b12+b22): s11 + s22, (b12+b22, b11+b21): s11 + s22,
+ (b12+b21, b11+b22): s12 + s21, (b11+b22, b12+b21): s12 + s21 }
+
+
+ if pos1 not in states:
+ st1 = (b11, b12)
+ st2 = (b12, b11)
+ score1 = 0
+ score2 = 0
+ for pp in left_connect.get(pos1,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pp, pos1, st0, st1 )
+ score2 += get_score( c_score, pp, pos1, st0, st2 )
+
+ for pp in right_connect.get(pos1,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pos1, pp, st1, st0 )
+ score2 += get_score( c_score, pos1, pp, st2, st0 )
+
+ if score1 >= score2:
+ states[pos1] = st1
+ else:
+ states[pos1] = st2
+
+ if pos2 not in states:
+ st1 = (b21, b22)
+ st2 = (b22, b21)
+ score1 = 0
+ score2 = 0
+ for pp in left_connect.get(pos2,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pp, pos2, st0, st1 )
+ score2 += get_score( c_score, pp, pos2, st0, st2 )
+
+ for pp in right_connect.get(pos2,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pos2, pp, st1, st0 )
+ score2 += get_score( c_score, pos2, pp, st2, st0 )
+
+ if score1 >= score2:
+ states[pos2] = st1
+ else:
+ states[pos2] = st2
+
+ positions = list(positions)
+ positions.sort()
+
+
+ iter_count = 0
+ while 1:
+ iter_count += 1
+ if iter_count > 10:
+ break
+ update_count = 0
+ for p in positions:
+ b1, b2 = states[p]
+ st1 = (b1, b2)
+ st2 = (b2, b1)
+
+ score1 = 0
+ score2 = 0
+ for pp in left_connect.get(p,[]):
+ st0 = states[pp]
+ score1 += get_score( c_score, pp, p, st0 ,st1)
+ score2 += get_score( c_score, pp, p, st0, st2)
+
+ #for pp in right_connect.get(p,[]):
+ # st0 = states[pp]
+ # score1 += get_score( c_score, p, pp, st1 ,st0)
+ # score2 += get_score( c_score, p, pp, st2, st0)
+
+ if score1 >= score2:
+ states[p] = st1
+ else:
+ states[p] = st2
+ update_count += 1
+ if update_count == 0:
+ break
+
+
+ right_extent = {}
+ right_score = {}
+ left_extent = {}
+ left_score = {}
+
+
+ for p in positions:
+
+ left_extent[p] = p
+ left_score[p] = 0
+ if p in left_connect:
+ left = p
+ st0 = states[p]
+ st0_ = st0[1], st0[0]
+ for pp in left_connect[p]:
+ st1 = states[pp]
+ s = get_score( c_score, pp, p, st1, st0)
+ s_ = get_score( c_score, pp, p, st1, st0_)
+ left_score[p] += s - s_
+ if s - s_ > 0 and pp < left:
+ left = pp
+ left_extent[p] = left
+
+ right_extent[p] = p
+ right_score[p] = 0
+ if p in right_connect:
+ right = p
+ st0 = states[p]
+ st0_ = st0[1], st0[0]
+ for pp in right_connect[p]:
+ st1 = states[pp]
+ s = get_score( c_score, p, pp, st0, st1)
+ s_ = get_score( c_score, p, pp, st0_, st1)
+ right_score[p] += s - s_
+ if s - s_ > 0 and pp > right:
+ right = pp
+ right_extent[p] = right
+
+
+
+
+ phase_block_id = 1
+ phase_blocks = {}
+ pb = []
+
+ max_right_ext = 0
+ for p in positions:
+ if right_score[p] < 10 or left_score[p] < 10:
+ continue
+ b1, b2 = states[p]
+ if max_right_ext < left_extent[p]:
+ if len(pb) > 3:
+ phase_blocks[phase_block_id] = pb
+ phase_block_id += 1
+ pb = []
+ pb.append( (p, b1, b2) )
+ if right_extent[p] > max_right_ext:
+ max_right_ext = right_extent[p]
+ if len(pb) > 3:
+ phase_blocks[phase_block_id] = pb
+ else:
+ phase_block_id -= 1
+
+
+ with open(p_variant_fn, "w") as out_f:
+ for pid in xrange(1, phase_block_id+1):
+ if len(phase_blocks[pid]) == 0:
+ continue
+ min_ = min( [x[0] for x in phase_blocks[pid]] )
+ max_ = max( [x[0] for x in phase_blocks[pid]] )
+
+ print >>out_f, "P", pid, min_, max_, max_ - min_, len(phase_blocks[pid]), 1.0 * (max_-min_)/len(phase_blocks[pid])
+ for p, b1, b2 in phase_blocks[pid]:
+ rb = ref_base[p]
+ print >>out_f, "V", pid, p, "%d_%s_%s" % (p,rb,b1), "%d_%s_%s" % (p,rb,b2), left_extent[p], right_extent[p], left_score[p], right_score[p]
+
+def get_phased_reads(self):
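+    """
+    For every read, count how many phased variants it shares with each phase of
+    each phase block; the read is assigned to a phase (and written to the
+    phased-reads file) when it supports that phase by at least two more
+    variants than the opposite phase.
+    """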
+
+ q_id_map_fn = fn(self.q_id_map_file)
+ vmap_fn = fn(self.vmap_file)
+ p_variant_fn = fn(self.phased_variant_file)
+ parameters = self.parameters
+
+ ctg_id = parameters["ctg_id"]
+
+ phased_read_fn = fn(self.phased_read_file)
+
+ rid_map = {}
+ with open(q_id_map_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ rid_map[int(l[0])] = l[1]
+
+
+ read_to_variants = {}
+ variant_to_reads = {}
+ with open(vmap_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ variant = "_".join(l[:3])
+ read_id = int(l[3])
+ read_to_variants.setdefault(read_id, set())
+ read_to_variants[read_id].add(variant)
+ variant_to_reads.setdefault(variant, set())
+ variant_to_reads[variant].add(read_id)
+
+
+ variant_to_phase = {}
+ with open(p_variant_fn) as f:
+ for l in f:
+ """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
+ l = l.strip().split()
+ if l[0] != "V":
+ continue
+ pb_id = int(l[1])
+ variant_to_phase[ l[3] ] = (pb_id, 0)
+ variant_to_phase[ l[4] ] = (pb_id, 1)
+
+ with open(phased_read_fn, "w") as out_f:
+ for r in read_to_variants:
+ vl = {}
+ pl = set()
+ for v in list( read_to_variants[r] ):
+ if v in variant_to_phase:
+ p = variant_to_phase[v]
+ vl[ p ] = vl.get(p, 0) + 1
+ pl.add(p[0])
+ pl = list(pl)
+ pl.sort()
+ for p in pl:
+ if vl.get( (p,0), 0) - vl.get( (p,1), 0) > 1:
+ print >> out_f, r, ctg_id, p, 0, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
+ elif vl.get( (p,1), 0) - vl.get( (p,0), 0) > 1:
+ print >> out_f, r, ctg_id, p, 1, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
+
+def phasing(args):
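+    """
+    Build and run a small pypeFLOW workflow for a single contig: het-variant
+    calling from the sorted bam, association-table generation, phase-block
+    construction, and finally read-to-phase assignment, with all outputs
+    written under base_dir/<ctg_id>/.
+    """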
+ bam_fn = args.bam
+ fasta_fn = args.fasta
+ ctg_id = args.ctg_id
+ base_dir = args.base_dir
+
+ ref_seq = ""
+ for r in FastaReader(fasta_fn):
+ rid = r.name.split()[0]
+ if rid != ctg_id:
+ continue
+ ref_seq = r.sequence.upper()
+
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=1,
+ )
+
+ bam_file = makePypeLocalFile(bam_fn)
+ vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_map") )
+ vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_pos") )
+ q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "q_id_map") )
+ parameters = {}
+ parameters["ctg_id"] = ctg_id
+ parameters["ref_seq"] = ref_seq
+ parameters["base_dir"] = base_dir
+
+ make_het_call_task = PypeTask( inputs = { "bam_file": bam_file },
+ outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file },
+ parameters = parameters,
+ ) (make_het_call)
+
+ wf.addTasks([make_het_call_task])
+
+
+
+
+ atable_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'g_atable', "atable") )
+ parameters = {}
+ parameters["ctg_id"] = ctg_id
+ parameters["base_dir"] = base_dir
+ generate_association_table_task = PypeTask( inputs = { "vmap_file": vmap_file },
+ outputs = { "atable_file": atable_file },
+ ) (generate_association_table)
+
+ wf.addTasks([generate_association_table_task])
+
+
+
+
+ phased_variant_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'get_phased_blocks', "phased_variants") )
+ get_phased_blocks_task = PypeTask( inputs = { "vmap_file": vmap_file, "atable_file": atable_file },
+ outputs = { "phased_variant_file": phased_variant_file },
+ ) (get_phased_blocks)
+ wf.addTasks([get_phased_blocks_task])
+
+
+
+
+ phased_read_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "phased_reads") )
+ get_phased_reads_task = PypeTask( inputs = { "vmap_file": vmap_file,
+ "q_id_map_file": q_id_map_file,
+ "phased_variant_file": phased_variant_file },
+ outputs = { "phased_read_file": phased_read_file },
+ parameters = {"ctg_id": ctg_id},
+ ) (get_phased_reads)
+ wf.addTasks([get_phased_reads_task])
+
+
+ wf.refreshTargets()
+ #with open("fc_phasing_wf.dot", "w") as f:
+ # print >>f, wf.graphvizDot
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(description='phasing variants and reads from a bam file')
+    # we can run this in parallel mode in the future
+ #parser.add_argument('--n_core', type=int, default=4,
+ # help='number of processes used for generating consensus')
+ parser.add_argument('--bam', type=str, help='path to sorted bam file', required=True)
+    parser.add_argument('--fasta', type=str, help='path to the fasta file containing the contig', required=True)
+ parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
+    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, defaults to the current working directory')
+
+
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ logging.basicConfig()
+ args = parse_args(argv)
+ phasing(args)
diff --git a/FALCON_unzip/falcon_unzip/phasing_readmap.py b/FALCON_unzip/falcon_unzip/phasing_readmap.py
new file mode 100644
index 0000000..2e30c4b
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/phasing_readmap.py
@@ -0,0 +1,73 @@
+import argparse
+import logging
+import os
+import re
+import sys
+
+
+def get_phasing_readmap(args):
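+    """
+    Translate daligner pread ids back to the original read names (via the
+    dumped rawread/pread id lists), look up each read's phase from the
+    phased-reads file, and write rid_to_phase.<ctg_id> mapping each zero-padded
+    pread id to its (phase block, phase); unphased reads get (-1, 0).
+    """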
+
+ phased_reads = args.phased_reads
+ read_map_dir = args.read_map_dir
+ the_ctg_id = args.ctg_id
+ base_dir = args.base_dir
+
+ rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids')
+ pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')
+ rid_to_oid = open(rawread_id_file).read().split('\n') #daligner raw read id to the original ids
+ pid_to_fid = open(pread_id_file).read().split('\n') #daligner pread id to the fake ids
+
+ def pid_to_oid(pid):
+ fid = pid_to_fid[int(pid)]
+ rid = int(fid.split('/')[1])/10
+ return rid_to_oid[int(rid)]
+
+ rid_to_oid = open(rawread_id_file).read().split('\n') #daligner raw read id to the original ids
+ pid_to_fid = open(pread_id_file).read().split('\n') #daligner pread id to the fake ids
+
+
+ rid_to_phase = {}
+ with open(phased_reads) as f:
+ for row in f:
+ row = row.strip().split()
+ rid_to_phase[row[6]] = (int(row[2]), int(row[3]))
+
+ arid_to_phase = {}
+ map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
+ with open(map_fn) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id = row[1]
+ if not ctg_id.startswith(the_ctg_id):
+ continue
+ if int(row[3]) != 0: #not the best hit
+ continue
+ o_id = pid_to_oid(row[0])
+ phase = rid_to_phase.get( o_id, (-1, 0) )
+ arid_to_phase['%09d' % int(row[0])] = phase
+
+ with open(os.path.join(base_dir, 'rid_to_phase.%s' % the_ctg_id),'w') as f:
+ for arid, phase in arid_to_phase.items():
+ print >>f, arid, the_ctg_id, phase[0], phase[1]
+
+
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(description='mapping internal daligner read id to phase block and phase',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    # we can run this in parallel mode in the future
+ #parser.add_argument('--n_core', type=int, default=4,
+ # help='number of processes used for generating consensus')
+ parser.add_argument('--phased_reads', type=str, help='path to read vs. phase map', required=True)
+ parser.add_argument('--read_map_dir', type=str, help='path to the read map directory', required=True)
+ parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
+    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, defaults to the current working directory')
+
+ args = parser.parse_args(argv[1:])
+ return args
+
+
+def main(argv=sys.argv):
+ logging.basicConfig()
+ args = parse_args(argv)
+ get_phasing_readmap(args)
diff --git a/FALCON_unzip/falcon_unzip/rr_hctg_track.py b/FALCON_unzip/falcon_unzip/rr_hctg_track.py
new file mode 100644
index 0000000..fa02434
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/rr_hctg_track.py
@@ -0,0 +1,201 @@
+from __future__ import print_function
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import sys
+import glob
+import os
+from heapq import heappush, heappop, heappushpop
+
+Reader = io.CapturedProcessReaderContext
+
+
+def get_rid_to_ctg(fn):
+ rid_to_ctg = {}
+ with open(fn) as f:
+ for row in f:
+ row = row.strip().split()
+ pid, rid, oid, ctg = row
+ rid_to_ctg.setdefault( rid, set() )
+ rid_to_ctg[ rid ].add(ctg)
+ return rid_to_ctg
+
+def run_tr_stage1(db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase):
+ cmd = 'LA4Falcon -m %s %s' % (db_fn, fn)
+ reader = Reader(cmd)
+ with reader:
+ return fn, tr_stage1(reader.readlines, min_len, bestn, rid_to_ctg, rid_to_phase)
+
+def tr_stage1(readlines, min_len, bestn, rid_to_ctg, rid_to_phase):
+ """
+    for each read in the b-read column of the LAS files, we keep the top
+    `bestn` hits, maintained with a priority queue across all overlaps
+ """
+
+ rtn = {}
+ for l in readlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+ overlap_len = -int(l[2])
+ idt = float(l[3])
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+ if t_l < min_len:
+ continue
+ if q_id not in rid_to_ctg:
+ continue
+
+ t_phase = rid_to_phase[ int(t_id) ]
+ if t_phase != None:
+ ctg_id, block, phase = t_phase
+ if block != -1:
+ q_phase = rid_to_phase[ int(q_id) ]
+ if q_phase != None:
+ if q_phase[0] == ctg_id and q_phase[1] == block and q_phase[2] != phase:
+ continue
+
+ rtn.setdefault(t_id, [])
+ if len(rtn[t_id]) < bestn:
+ heappush(rtn[t_id], (overlap_len, q_id) )
+ else:
+ heappushpop(rtn[t_id], (overlap_len, q_id) )
+
+ return rtn
+
+def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
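+    """
+    Run tr_stage1 over every LAS file in parallel, merge the per-file best-n
+    a-read hits for each b-read, and write a ranked rawread_to_contigs table
+    (read, contig, hit count, rank, score, in-contig flag) under
+    4-quiver/read_maps.
+    """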
+ io.LOG('preparing tr_stage1')
+ io.logstats()
+ asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
+ hasm_dir = os.path.abspath(os.path.join(base_dir, '3-unzip'))
+ quiver_dir = os.path.abspath(os.path.join(base_dir, '4-quiver'))
+ rid_to_ctg = get_rid_to_ctg(os.path.join(quiver_dir, 'read_maps', 'read_to_contig_map' ))
+
+ phased_read_file = os.path.join(hasm_dir, 'all_phased_reads')
+ oid_to_phase = {}
+ with open(phased_read_file) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id, block, phase = row[1:4]
+ oid = row[6]
+ block = int(block)
+ phase = int(phase)
+ oid_to_phase[ oid ] = (ctg_id, block, phase)
+ rid_to_phase = {}
+ rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
+ rid_to_oid = open(rawread_ids_fn).read().split('\n')
+ rid_to_phase = [ None ] * len( rid_to_oid )
+ for rid, oid in enumerate(rid_to_oid):
+ rid_to_phase[rid] = oid_to_phase.get( oid, None )
+
+
+ inputs = []
+ for fn in file_list:
+ inputs.append( (run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase) )
+ """
+    Aggregate hits from each individual LAS file and keep the best n hits.
+    Note that this does not guarantee that the final result is globally the best n hits, especially
+    when `bestn` is too small. In that case, if a single LAS file contributes more hits,
+    some good hits will be missed.
+ """
+ bread_to_areads = {}
+ for fn, res in exe_pool.imap(io.run_func, inputs):
+ for k in res:
+ bread_to_areads.setdefault(k, [])
+ for item in res[k]:
+ if len(bread_to_areads[k]) < bestn:
+ heappush( bread_to_areads[k], item )
+ else:
+ heappushpop( bread_to_areads[k], item )
+
+ #rid_to_oid = open(os.path.join(rawread_dir, 'dump_rawread_ids', 'rawread_ids')).read().split('\n')
+
+ """
+    For each b-read, find the best contig assignment through the b -> a -> contig map.
+ """
+ with open(os.path.join(quiver_dir, 'read_maps/rawread_to_contigs'), 'w') as out_f:
+ for bread in bread_to_areads:
+
+ ctg_score = {}
+ for s, rid in bread_to_areads[bread]:
+ if rid not in rid_to_ctg: continue
+
+ ctgs = rid_to_ctg[rid]
+ for ctg in ctgs:
+ ctg_score.setdefault(ctg, [0,0])
+ ctg_score[ctg][0] += -s
+ ctg_score[ctg][1] += 1
+
+ #oid = rid_to_oid[int(bread)]
+ ctg_score = ctg_score.items()
+ ctg_score.sort( key = lambda k: k[1][0] )
+ rank = 0
+
+ for ctg, score_count in ctg_score:
+ if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
+ in_ctg = 1
+ else:
+ in_ctg = 0
+ score, count = score_count
+ #print(bread, oid, ctg, count, rank, score, in_ctg, file=out_f)
+ print(bread, ctg, count, rank, score, in_ctg, file=out_f)
+ rank += 1
+
+
+
+def try_run_track_reads(n_core, base_dir, min_len, bestn):
+ io.LOG('starting track_reads')
+
+ rawread_dir = os.path.abspath(os.path.join(base_dir, '0-rawreads'))
+
+    # TODO: better logic for locating the LAS files, or move this logic out (take a --fofn option?)
+ file_list = glob.glob( os.path.join(rawread_dir, 'm*/raw_reads.*.las'))
+ io.LOG('file list: %r' % file_list)
+
+    # likewise, should this be a parameter?
+ db_fn = os.path.join(rawread_dir, 'raw_reads.db')
+ n_core = min(n_core, len(file_list))
+ exe_pool = Pool(n_core)
+ try:
+ run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
+ io.LOG('finished track_reads')
+ except:
+ io.LOG('terminating track_reads workers...')
+ exe_pool.terminate()
+ raise
+
+def track_reads(n_core, base_dir, min_len, bestn, debug, silent, stream):
+ if debug:
+ n_core = 0
+ silent = False
+ if silent:
+ io.LOG = io.write_nothing
+ if stream:
+ global Reader
+ Reader = io.StreamedProcessReaderContext
+
+ try_run_track_reads(n_core, base_dir, min_len, bestn)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='scan the raw-read overlap information to identify the best hits from the reads \
+to the contigs, using the read_to_contig_map generated by `fc_get_read_hctg_map` in `4-quiver/read_maps/read_to_contig_map`. Assumes 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for tracking reads; '
+                        '0 for main process only')
+    #parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+ #parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
+ parser.add_argument('--base_dir', type=str, default="./", help='the base working dir of a FALCON assembly')
+ parser.add_argument('--min_len', type=int, default=2500, help='min length of the reads')
+ parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+ parser.add_argument('--debug', '-g', action='store_true', help='single-threaded, plus other aids to debugging')
+ parser.add_argument('--silent', action='store_true', help='suppress cmd reporting on stderr')
+ parser.add_argument('--bestn', type=int, default=40, help='keep best n hits')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+ track_reads(**vars(args))
+
+if __name__ == '__main__':
+ main()
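The bounded best-n bookkeeping in tr_stage1() and run_track_reads() above is the usual size-capped min-heap idiom from heapq; a small self-contained sketch of the same idea (the scores and read ids below are made up):

    from heapq import heappush, heappushpop

    def keep_best_n(items, bestn):
        # Keep the `bestn` largest (score, read_id) pairs with a min-heap;
        # heappushpop() inserts the new item and evicts the current smallest.
        heap = []
        for item in items:
            if len(heap) < bestn:
                heappush(heap, item)
            else:
                heappushpop(heap, item)
        return sorted(heap, reverse=True)

    # keep_best_n([(5000, '001'), (12000, '002'), (800, '003')], 2)
    # -> [(12000, '002'), (5000, '001')]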
diff --git a/FALCON_unzip/falcon_unzip/run_quiver.py b/FALCON_unzip/falcon_unzip/run_quiver.py
new file mode 100644
index 0000000..4eb20e6
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/run_quiver.py
@@ -0,0 +1,394 @@
+from falcon_kit import run_support as support
+from pypeflow.simple_pwatcher_bridge import (
+ PypeLocalFile, makePypeLocalFile, fn,
+ PypeTask,
+ PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
+from falcon_kit.FastaReader import FastaReader
+import glob
+import json
+import logging
+import os
+import pprint
+import re
+import sys
+import time
+import ConfigParser
+
+
+LOG = logging.getLogger(__name__)
+
+def system(call, check=False):
+ LOG.debug('$(%s)' %repr(call))
+ rc = os.system(call)
+ msg = 'Call %r returned %d.' % (call, rc)
+ if rc:
+ LOG.warning(msg)
+ if check:
+ raise Exception(msg)
+ else:
+ LOG.debug(msg)
+ return rc
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def task_track_reads(self):
+ job_done = fn(self.job_done)
+ wd = self.parameters['wd']
+ config = self.parameters['config']
+ input_bam_fofn = config['input_bam_fofn']
+ sge_track_reads = config['sge_track_reads']
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir, 'track_reads_h.sh')
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+cd {wd}
+hostname
+date
+fc_get_read_hctg_map.py --basedir ../..
+fc_rr_hctg_track.py --base_dir ../..
+mkdir -p 4-quiver/reads/
+fc_select_reads_from_bam.py --basedir ../.. {input_bam_fofn}
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_track_reads
+
+
+def task_run_quiver(self):
+
+ ref_fasta = fn(self.ref_fasta)
+ read_sam = fn(self.read_sam)
+
+ cns_fasta = fn(self.cns_fasta)
+ cns_fastq = fn(self.cns_fastq)
+ job_done = fn(self.job_done)
+
+ job_uid = self.parameters['job_uid']
+ wd = self.parameters['wd']
+ config = self.parameters['config']
+ ctg_id = self.parameters['ctg_id']
+
+ smrt_bin = config['smrt_bin']
+ sge_quiver = config['sge_quiver']
+ job_type = config['job_type']
+ samtools = os.path.join(smrt_bin, 'samtools')
+ pbalign = os.path.join(smrt_bin, 'pbalign')
+ makePbi = os.path.join(smrt_bin, 'makePbi')
+ variantCaller = os.path.join(smrt_bin, 'variantCaller')
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'cns_%s.sh' % (ctg_id))
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+hostname
+date
+cd {wd}
+
+{samtools} faidx {ref_fasta}
+{samtools} view -b -S {read_sam} > {ctg_id}.bam
+{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
+ --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
+ --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
+ {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam
+#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam
+({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
+ -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_quiver
+
+def rm(f):
+ system('rm -f {}'.format(f))
+def touch(f):
+ system('touch {}'.format(f))
+def task_cns_zcat(self):
+ gathered_p_ctg_fn = fn(self.gathered_p_ctg)
+ gathered_h_ctg_fn = fn(self.gathered_h_ctg)
+
+ cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
+ cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
+ cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
+ cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
+ job_done_fn = fn(self.job_done)
+
+ rm(cns_p_ctg_fasta_fn)
+ touch(cns_p_ctg_fasta_fn)
+ rm(cns_p_ctg_fastq_fn)
+ touch(cns_p_ctg_fastq_fn)
+ with open(gathered_p_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+ system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(**locals()))
+ system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(**locals()))
+ with open(gathered_p_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+            rm(cns_fasta_fn)
+            rm(cns_fastq_fn)
+ rm(cns_h_ctg_fasta_fn)
+ touch(cns_h_ctg_fasta_fn)
+ rm(cns_h_ctg_fastq_fn)
+ touch(cns_h_ctg_fastq_fn)
+ with open(gathered_h_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+ system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(**locals()))
+ system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(**locals()))
+ with open(gathered_h_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+            rm(cns_fasta_fn)
+            rm(cns_fastq_fn)
+
+ touch(job_done_fn)
+
+def task_scatter_quiver(self):
+ p_ctg_fn = fn(self.p_ctg_fa)
+ h_ctg_fn = fn(self.h_ctg_fa)
+ out_json = fn(self.scattered_quiver_json)
+
+ ref_seq_data = {}
+
+ # I think this will crash if the file is empty. Maybe that is ok.
+ p_ctg_fa = FastaReader(p_ctg_fn)
+ ctg_types = {}
+ for r in p_ctg_fa:
+ rid = r.name.split()[0]
+ ref_seq_data[rid] = r.sequence
+ ctg_types[rid] = 'p'
+
+
+ # I think this will crash if the file is empty. Maybe that is ok.
+ h_ctg_fa = FastaReader(h_ctg_fn)
+ for r in h_ctg_fa:
+ rid = r.name.split()[0]
+ ref_seq_data[rid] = r.sequence
+ ctg_types[rid] = 'h'
+
+ ctg_ids = sorted(ref_seq_data.keys())
+ #p_ctg_out=[]
+ #h_ctg_out=[]
+ #job_done_plfs = {}
+ jobs = []
+ for ctg_id in ctg_ids:
+ sequence = ref_seq_data[ctg_id]
+ m_ctg_id = ctg_id.split('-')[0]
+ wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
+ ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
+ read_sam = os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id))
+ #cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
+ #cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
+ #job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
+
+ if os.path.exists(read_sam): # TODO(CD): Ask Jason what we should do if missing SAM. And what about network latency?
+ #if ctg_types[ctg_id] == 'p':
+ # p_ctg_out.append( (cns_fasta, cns_fastq) )
+ #if ctg_types[ctg_id] == 'h':
+ # h_ctg_out.append( (cns_fasta, cns_fastq) )
+ mkdir(wd)
+ if not os.path.exists(fn(ref_fasta)):
+ with open(fn(ref_fasta),'w') as f:
+ print >>f, '>'+ctg_id
+ print >>f, sequence
+ #parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
+ #make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam},
+ # outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
+ # parameters = parameters,
+ # )
+ #quiver_task = make_quiver_task(task_run_quiver)
+ #wf.addTask(quiver_task)
+ #job_done_plfs['{}'.format(ctg_id)] = job_done
+        new_job = {}
+        new_job['ctg_id'] = ctg_id
+        new_job['ctg_type'] = ctg_types[ctg_id]
+        jobs.append(new_job)
+ open(out_json, 'w').write(json.dumps(jobs))
+
+def create_quiver_jobs(wf, scattered_quiver_plf, config):
+ scattered_quiver_fn = fn(scattered_quiver_plf)
+ jobs = json.loads(open(scattered_quiver_fn).read())
+ #ctg_ids = sorted(jobs['ref_seq_data'])
+ p_ctg_out=[]
+ h_ctg_out=[]
+ job_done_plfs = {}
+ for job in jobs:
+ ctg_id = job['ctg_id']
+ m_ctg_id = ctg_id.split('-')[0]
+ wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
+ ref_fasta = makePypeLocalFile(os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id)))
+ read_sam = makePypeLocalFile(os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id)))
+ cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
+ cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
+ job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
+
+ if os.path.exists(fn(read_sam)): # TODO(CD): Ask Jason what we should do if missing SAM.
+            if job['ctg_type'] == 'p':
+                p_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
+            elif job['ctg_type'] == 'h':
+                h_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
+            else:
+                LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(job['ctg_type']))
+ parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
+ make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam,
+ 'scattered_quiver': scattered_quiver_plf,
+ },
+ outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
+ parameters = parameters,
+ )
+ quiver_task = make_quiver_task(task_run_quiver)
+ wf.addTask(quiver_task)
+ job_done_plfs['{}'.format(ctg_id)] = job_done
+ #sge_quiver = config['sge_quiver']
+ return p_ctg_out, h_ctg_out, job_done_plfs
+
+def task_gather_quiver(self):
+ """We wrote the "gathered" files during task construction.
+ """
+ job_done_fn = fn(self.job_done)
+ touch(job_done_fn)
+
+
+def main(argv=sys.argv):
+ global LOG
+ LOG = support.setup_logger(None)
+
+
+    if len(argv) < 2:
+        print>>sys.stderr, 'you need to provide a configuration file to specify the cluster running environment'
+        sys.exit(1)
+
+    config_fn = argv[1]
+ config_absbasedir = os.path.dirname(os.path.abspath(config_fn))
+
+ config = ConfigParser.ConfigParser()
+ config.read(config_fn)
+
+
+ job_type = 'SGE'
+ if config.has_option('General', 'job_type'):
+ job_type = config.get('General', 'job_type')
+
+ sge_track_reads = ' -pe smp 12 -q bigmem'
+ if config.has_option('Unzip', 'sge_track_reads'):
+ sge_track_reads = config.get('Unzip', 'sge_track_reads')
+
+ sge_quiver = ' -pe smp 24 -q bigmem '
+ if config.has_option('Unzip', 'sge_quiver'):
+ sge_quiver = config.get('Unzip', 'sge_quiver')
+
+ smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
+ if config.has_option('Unzip', 'smrt_bin'):
+ smrt_bin = config.get('Unzip', 'smrt_bin')
+
+ input_bam_fofn = 'input_bam.fofn'
+ if config.has_option('Unzip', 'input_bam_fofn'):
+ input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
+ if not os.path.isabs(input_bam_fofn):
+ input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)
+
+
+ quiver_concurrent_jobs = 8
+ if config.has_option('Unzip', 'quiver_concurrent_jobs'):
+ quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')
+
+ config = {'job_type': job_type,
+ 'sge_quiver': sge_quiver,
+ 'sge_track_reads': sge_track_reads,
+ 'input_bam_fofn': input_bam_fofn,
+ 'smrt_bin': smrt_bin}
+ LOG.info('config={}'.format(pprint.pformat(config)))
+
+ #support.job_type = 'SGE' #tmp hack until we have a configuration parser
+
+
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=quiver_concurrent_jobs,
+ )
+
+ abscwd = os.path.abspath('.')
+ parameters = {'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'), 'config': config}
+ hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention
+ track_reads_h_done_plf = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_h_done'))
+ make_track_reads_task = PypeTask(inputs = {'hasm_done': hasm_done_plf},
+ outputs = {'job_done': track_reads_h_done_plf},
+ parameters = parameters,
+ )
+ track_reads_task = make_track_reads_task(task_track_reads)
+ #sge_track_reads = config['sge_track_reads']
+
+ wf.addTask(track_reads_task)
+
+ scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json')
+ make_task = PypeTask(
+ inputs = {
+ 'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
+ 'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
+ 'track_reads_h_done': track_reads_h_done_plf,
+ },
+ outputs = {
+ 'scattered_quiver_json': scattered_quiver_plf,
+ },
+ parameters = {},
+ )
+ wf.addTask(make_task(task_scatter_quiver))
+ wf.refreshTargets()
+
+    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(wf, scattered_quiver_plf, config)
+
+ gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
+ gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
+ gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
+ mkdir('4-quiver/cns_gather')
+ with open(fn(gathered_p_ctg_plf), 'w') as ifs:
+ for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
+ ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
+ with open(fn(gathered_h_ctg_plf), 'w') as ifs:
+ for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
+ ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
+
+ make_task = PypeTask(
+ inputs = job_done_plfs,
+ outputs = {
+ 'job_done': gather_done_plf,
+ },
+ parameters = {},
+ )
+ wf.addTask(make_task(task_gather_quiver))
+ wf.refreshTargets()
+
+ cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta')
+ cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq')
+ cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta')
+ cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq')
+ zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
+ make_task = PypeTask(
+ inputs = {
+ 'gathered_p_ctg': gathered_p_ctg_plf,
+ 'gathered_h_ctg': gathered_h_ctg_plf,
+ 'gather_done': gather_done_plf,
+ },
+ outputs = {
+ 'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
+ 'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
+ 'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
+ 'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
+ 'job_done': zcat_done_plf,
+ },
+ )
+ wf.addTask(make_task(task_cns_zcat))
+
+ wf.refreshTargets()
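The scatter/gather handoff above is a plain JSON list of per-contig job records, written by task_scatter_quiver() and re-read by create_quiver_jobs(). A minimal sketch of that round trip (the file name and field values are illustrative only):

    import json

    # write one record per contig, as task_scatter_quiver() does
    jobs = [{'ctg_id': '000000F', 'ctg_type': 'p'},
            {'ctg_id': '000000F_001', 'ctg_type': 'h'}]
    open('scattered.json', 'w').write(json.dumps(jobs))

    # read the records back later, as create_quiver_jobs() does
    for job in json.loads(open('scattered.json').read()):
        print job['ctg_id'], job['ctg_type']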
diff --git a/FALCON_unzip/falcon_unzip/select_reads_from_bam.py b/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
new file mode 100644
index 0000000..03cbde1
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
@@ -0,0 +1,113 @@
+import pysam
+
+import argparse
+import glob
+import os
+import sys
+
+def select_reads_from_bam(input_bam_fofn_fn, asm_dir, hasm_dir, quiver_dir):
+ read_partition = {}
+ read_to_ctgs = {}
+
+ rawread_to_contigs_fn = os.path.join(quiver_dir, 'read_maps', 'rawread_to_contigs')
+ rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
+ rid_to_oid = open(rawread_ids_fn).read().split('\n')
+ with open(rawread_to_contigs_fn) as f:
+ for row in f:
+ row = row.strip().split()
+            if int(row[3]) >= 1: # keep only the top (rank-0) hit
+ continue
+ ctg_id = row[1]
+ if ctg_id == 'NA':
+ continue
+ read_partition.setdefault( ctg_id, set())
+ r_id = row[0]
+ o_id = rid_to_oid[ int(r_id) ]
+ read_partition[ ctg_id ].add(o_id)
+ read_to_ctgs.setdefault(o_id, [])
+ read_to_ctgs[ o_id ].append( (int(row[4]) ,ctg_id) )
+ print "num read_partitions:", len(read_partition)
+ print "num read_to_ctgs:", len(read_to_ctgs)
+
+ header = None
+ fofn_basedir = os.path.normpath(os.path.dirname(input_bam_fofn_fn))
+ def abs_fn(maybe_rel_fn):
+ if os.path.isabs(maybe_rel_fn):
+ return maybe_rel_fn
+ else:
+ return os.path.join(fofn_basedir, maybe_rel_fn)
+ for row in open(input_bam_fofn_fn):
+ fn = abs_fn(row.strip())
+ samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
+ if header == None:
+ header = samfile.header
+ else:
+ header['RG'].extend(samfile.header['RG'])
+ samfile.close()
+
+    PG = header.pop('PG') # remove PG lines, as there may be a bug that generates unreadable characters
+ #print PG
+
+ #base_dir = os.getcwd()
+ #outfile = pysam.AlignmentFile( os.path.join(base_dir, 'header.sam' ), 'wh', header=header )
+ #outfile.close()
+
+ ctgs = read_partition.keys()
+ ctgs.sort()
+ selected_ctgs = set()
+ for ctg in ctgs:
+ picked_reads = read_partition[ ctg ]
+ print "ctg, len:", ctg, len(picked_reads) # TODO: Is this for debugging?
+ if len(picked_reads) > 20:
+ selected_ctgs.add(ctg)
+
+ outfile = {}
+
+ sam_dir = os.path.join(quiver_dir, 'reads')
+ mkdir(sam_dir)
+ for row in open(input_bam_fofn_fn):
+ fn = abs_fn(row.strip())
+ samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
+ for r in samfile.fetch( until_eof = True ):
+ if r.query_name not in read_to_ctgs:
+ #print "Missing:", r.query_name
+ continue
+ ctg_list = read_to_ctgs[ r.query_name ]
+ ctg_list.sort()
+ score, ctg = ctg_list[0]
+ if ctg not in selected_ctgs:
+ #print "Not selected:", ctg
+ continue
+ if ctg not in outfile:
+ samfile_fn = os.path.join(quiver_dir, 'reads', '%s.sam' % ctg)
+ print >>sys.stderr, 'samfile_fn:{!r}'.format(samfile_fn)
+ outfile[ctg] = pysam.AlignmentFile(samfile_fn, 'wh', header=header)
+ outfile[ctg].write(r)
+ samfile.close()
+
+ for ctg in outfile:
+ outfile[ctg].close()
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='Assumes 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--basedir', type=str, default='./', help='the base working dir of a FALCON assembly')
+ parser.add_argument('input_bam_fofn', type=str, help='File of BAM filenames. Paths are relative to dir of FOFN, not CWD.')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+ basedir = args.basedir
+ input_bam_fofn = args.input_bam_fofn
+ #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
+ #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
+ asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
+ hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
+ quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
+
+ select_reads_from_bam(input_bam_fofn, asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
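For orientation, each record in 4-quiver/read_maps/rawread_to_contigs (written by fc_rr_hctg_track above) carries six whitespace-separated fields (raw-read id, contig, hit count, rank, score, in-contig flag), and the reader above keeps only rank-0, non-'NA' assignments. A small parsing sketch, where the sample line is made up:

    # sketch of the rank-0 filter used above; the line is a fabricated example
    line = '000001234 000000F 12 0 -45678 0'
    r_id, ctg_id, count, rank, score, in_ctg = line.strip().split()
    if int(rank) == 0 and ctg_id != 'NA':
        print r_id, '->', ctg_id   # this read goes to its best contig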
diff --git a/FALCON_unzip/falcon_unzip/unzip.py b/FALCON_unzip/falcon_unzip/unzip.py
new file mode 100644
index 0000000..e585fbc
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/unzip.py
@@ -0,0 +1,357 @@
+from falcon_kit import run_support as support
+from pypeflow.simple_pwatcher_bridge import (
+ PypeLocalFile, makePypeLocalFile, fn,
+ PypeTask,
+ PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
+from falcon_kit.FastaReader import FastaReader
+import glob
+import logging
+import os
+import re
+import sys
+import time
+import ConfigParser
+
+LOG = logging.getLogger(__name__)
+
+def system(call, check=False):
+ LOG.debug('$(%s)' %repr(call))
+ rc = os.system(call)
+ msg = 'Call %r returned %d.' % (call, rc)
+ if rc:
+ LOG.warning(msg)
+ if check:
+ raise Exception(msg)
+ else:
+ LOG.debug(msg)
+ return rc
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def task_track_reads(self):
+ job_done = fn(self.job_done)
+ wd = self.parameters['wd']
+ config = self.parameters['config']
+ sge_track_reads = config['sge_track_reads']
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'track_reads.sh')
+ topdir = '../..'
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+hostname
+date
+cd {topdir}
+python -m falcon_kit.mains.get_read_ctg_map
+python -m falcon_kit.mains.rr_ctg_track
+python -m falcon_kit.mains.pr_ctg_track
+#mkdir -p 3-unzip/reads/
+python -m falcon_kit.mains.fetch_reads
+cd {wd}
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_track_reads
+
+
+def task_run_blasr(self):
+ job_done = fn(self.job_done)
+ ref_fasta = fn(self.ref_fasta)
+ read_fasta = fn(self.read_fasta)
+
+ job_uid = self.parameters['job_uid']
+ wd = self.parameters['wd']
+ ctg_id = self.parameters['ctg_id']
+
+ config = self.parameters['config']
+ smrt_bin = config['smrt_bin']
+ sge_blasr_aln = config['sge_blasr_aln']
+ job_type = config['job_type']
+ blasr = os.path.join(smrt_bin, 'blasr')
+ samtools = os.path.join(smrt_bin, 'samtools')
+
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'aln_{ctg_id}.sh'.format(ctg_id = ctg_id))
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+cd {wd}
+hostname
+date
+cd {wd}
+time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
+ -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
+ -minMatch 12 -nproc 24 -sam -out tmp_aln.sam
+{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
+{samtools} index {ctg_id}_sorted.bam
+rm tmp_aln.sam
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_blasr_aln
+
+
+def task_phasing(self):
+ ref_fasta = fn(self.ref_fasta)
+ aln_bam = fn(self.aln_bam)
+
+ job_done = fn(self.job_done)
+
+ job_uid = self.parameters['job_uid']
+ wd = self.parameters['wd']
+ ctg_id = self.parameters['ctg_id']
+
+ config = self.parameters['config']
+ sge_phasing = config['sge_phasing']
+ job_type = config['job_type']
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'p_%s.sh' % (ctg_id))
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+cd {wd}
+hostname
+date
+cd {wd}
+fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir ..
+fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_phasing
+
+
+def task_hasm(self):
+ rid_to_phase_all = fn(self.rid_to_phase_all)
+ job_done = fn(self.job_done)
+ config = self.parameters['config']
+ sge_hasm = config['sge_hasm']
+
+ wd = self.parameters['wd']
+
+ job_type = config['job_type']
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'hasm.sh')
+
+    #las_fofn = '../../2-asm-falcon/las.fofn'  # superseded by the path below
+    las_fofn = '../../1-preads_ovl/merge-gather/las.fofn'
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+hostname
+date
+cd {wd}
+
+fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
+fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
+if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
+then
+ ln -sf ../../1-preads_ovl/preads4falcon.fasta .
+else
+ ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
+fi
+fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta
+
+# more script steps; a bit hacky here, we should improve this
+
+WD=$PWD
+for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done
+
+## prepare for quivering the haplotigs
+cd $WD/..
+if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
+if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
+if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
+if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
+if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi
+
+find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
+find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
+find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
+find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
+find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
+find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
+cd ../
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_hasm
+
+def unzip_all(config):
+ unzip_concurrent_jobs = config['unzip_concurrent_jobs']
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=unzip_concurrent_jobs,
+ )
+ wf.max_jobs = unzip_concurrent_jobs
+
+ ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
+ falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
+ wdir = os.path.abspath('./3-unzip/reads')
+ parameters = {'wd': wdir, 'config': config}
+
+ job_done = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_done'))
+ make_track_reads_task = PypeTask(inputs = {'falcon_asm_done': falcon_asm_done},
+ outputs = {'job_done': job_done, 'ctg_list_file': ctg_list_file},
+ parameters = parameters,
+ wdir = wdir,
+ )
+ track_reads_task = make_track_reads_task(task_track_reads)
+
+ wf.addTask(track_reads_task)
+ wf.refreshTargets() #force refresh now, will put proper dependence later
+
+ ctg_ids = []
+ with open('./3-unzip/reads/ctg_list') as f:
+ for row in f:
+ row = row.strip()
+ ctg_ids.append(row)
+
+ aln1_outs = {}
+
+ all_ctg_out = {}
+
+ for ctg_id in ctg_ids:
+ # inputs
+ ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
+ read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))
+
+ # outputs
+ wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))
+ #mkdir(wd)
+ blasr_dir = os.path.join(wd, 'blasr')
+ ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))
+ job_done = makePypeLocalFile(os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id = ctg_id)))
+
+ parameters = {'job_uid':'aln-'+ctg_id, 'wd': blasr_dir, 'config':config, 'ctg_id': ctg_id}
+ make_blasr_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_fasta': read_fasta},
+ outputs = {'ctg_aln_out': ctg_aln_out, 'job_done': job_done},
+ parameters = parameters,
+ )
+ blasr_task = make_blasr_task(task_run_blasr)
+ aln1_outs[ctg_id] = (ctg_aln_out, job_done)
+ wf.addTask(blasr_task)
+
+ phasing_dir = os.path.join(wd, 'phasing')
+ job_done = makePypeLocalFile(os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id = ctg_id)))
+ rid_to_phase_out = makePypeLocalFile(os.path.join(wd, 'rid_to_phase.{ctg_id}'.format(ctg_id = ctg_id))) # TODO: ???
+ all_ctg_out[ 'r2p.{ctg_id}'.format(ctg_id = ctg_id) ] = rid_to_phase_out # implicit output?
+
+ parameters = {'job_uid':'ha-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id}
+ make_phasing_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'aln_bam':ctg_aln_out},
+ outputs = {'job_done': job_done},
+ parameters = parameters,
+ )
+ phasing_task = make_phasing_task(task_phasing)
+ wf.addTask(phasing_task)
+
+ wf.refreshTargets()
+
+ hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
+ #mkdir(hasm_wd)
+ rid_to_phase_all = makePypeLocalFile(os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
+ task = PypeTask(inputs = all_ctg_out, outputs = {'rid_to_phase_all': rid_to_phase_all},
+ ) (get_rid_to_phase_all)
+ wf.addTask(task)
+
+ parameters['wd'] = hasm_wd
+ job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
+ make_hasm_task = PypeTask(inputs = {'rid_to_phase_all': rid_to_phase_all},
+ outputs = {'job_done': job_done},
+ parameters = parameters,
+ )
+ hasm_task = make_hasm_task(task_hasm)
+
+ wf.addTask(hasm_task)
+
+ wf.refreshTargets()
+
+def get_rid_to_phase_all(self):
+ # Tasks must be at module scope now.
+ rid_to_phase_all_fn = fn(self.rid_to_phase_all)
+ inputs_fn = [ fn(f) for f in self.inputs.values() ]
+ inputs_fn.sort()
+ output = []
+ for fname in inputs_fn:
+        output.append(open(fname).read())
+
+ out = open(rid_to_phase_all_fn, 'w')
+ out.write(''.join(output))
+ out.close()
+
+def main(argv=sys.argv):
+ global LOG
+ LOG = support.setup_logger(None)
+
+
+ if len(argv) < 2 or argv[1].startswith('-'):
+        print 'you need to provide a configuration file to specify the cluster running environment'
+ sys.exit(1)
+
+ config_fn = argv[1]
+
+ config = ConfigParser.ConfigParser()
+ config.read(config_fn)
+
+ job_type = 'SGE'
+ if config.has_option('General', 'job_type'):
+ job_type = config.get('General', 'job_type')
+
+ sge_blasr_aln = ' -pe smp 24 -q bigmem '
+ if config.has_option('Unzip', 'sge_blasr_aln'):
+ sge_blasr_aln = config.get('Unzip', 'sge_blasr_aln')
+
+ smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
+ if config.has_option('Unzip', 'smrt_bin'):
+ smrt_bin = config.get('Unzip', 'smrt_bin')
+
+ sge_phasing = ' -pe smp 12 -q bigmem'
+ if config.has_option('Unzip', 'sge_phasing'):
+ sge_phasing = config.get('Unzip', 'sge_phasing')
+
+ sge_hasm = ' -pe smp 48 -q bigmem'
+ if config.has_option('Unzip', 'sge_hasm'):
+ sge_hasm = config.get('Unzip', 'sge_hasm')
+
+ sge_track_reads = ' -pe smp 12 -q bigmem'
+ if config.has_option('Unzip', 'sge_track_reads'):
+ sge_track_reads = config.get('Unzip', 'sge_track_reads')
+
+ unzip_concurrent_jobs = 8
+ if config.has_option('Unzip', 'unzip_concurrent_jobs'):
+ unzip_concurrent_jobs = config.getint('Unzip', 'unzip_concurrent_jobs')
+
+ config = {'job_type': job_type,
+ 'sge_blasr_aln': sge_blasr_aln,
+ 'smrt_bin': smrt_bin,
+ 'sge_phasing': sge_phasing,
+ 'sge_hasm': sge_hasm,
+ 'sge_track_reads': sge_track_reads,
+ 'unzip_concurrent_jobs':unzip_concurrent_jobs}
+
+ #support.job_type = 'SGE' #tmp hack until we have a configuration parser
+
+ unzip_all(config)
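Both main() functions above repeat the same ConfigParser pattern: read an option from the [Unzip] (or [General]) section if it is present, otherwise fall back to a hard-coded default. A small helper capturing that pattern (the helper name and the config file name are illustrative, not part of the module):

    import ConfigParser  # 'configparser' on Python 3

    def get_option(config, section, option, default):
        # return the configured value if present, else the default
        if config.has_option(section, option):
            return config.get(section, option)
        return default

    config = ConfigParser.ConfigParser()
    config.read('fc_unzip.cfg')
    job_type = get_option(config, 'General', 'job_type', 'SGE')
    smrt_bin = get_option(config, 'Unzip', 'smrt_bin', '/opt/smrtcmds/bin/')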
diff --git a/FALCON_unzip/setup.py b/FALCON_unzip/setup.py
new file mode 100755
index 0000000..b54466c
--- /dev/null
+++ b/FALCON_unzip/setup.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+from setuptools import setup
+
+from distutils.core import Extension
+
+import glob
+
+install_requires=[ "networkx >= 1.7", "pysam >= 0.8.4" ]
+
+scripts = glob.glob("src/py_scripts/*.py")
+
+setup(name='falcon_unzip',
+ version='0.1.0',
+ description='Falcon unzip',
+ author='Jason Chin',
+      author_email='jchin@pacificbiosciences.com',
+ packages=['falcon_unzip'],
+ package_dir={'falcon_unzip':'falcon_unzip/'},
+ scripts = scripts,
+ zip_safe = False,
+ install_requires=install_requires
+ )
+
diff --git a/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
new file mode 100644
index 0000000..71b60f5
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
@@ -0,0 +1,4 @@
+from falcon_unzip.dedup_h_tigs import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py b/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
new file mode 100644
index 0000000..09af64b
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
@@ -0,0 +1,4 @@
+from falcon_unzip.get_read_hctg_map import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
new file mode 100644
index 0000000..e9fcb1f
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
@@ -0,0 +1,4 @@
+from falcon_unzip.graphs_to_h_tigs import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py b/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
new file mode 100644
index 0000000..c3a38d2
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
@@ -0,0 +1,6 @@
+from falcon_unzip.ovlp_filter_with_phase import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
+
+
diff --git a/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py b/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
new file mode 100644
index 0000000..2c58c5a
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
@@ -0,0 +1,5 @@
+from falcon_unzip.phased_ovlp_to_graph import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
+
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing.py b/FALCON_unzip/src/py_scripts/fc_phasing.py
new file mode 100644
index 0000000..70c4711
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_phasing.py
@@ -0,0 +1,5 @@
+from falcon_unzip.phasing import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
+
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py b/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
new file mode 100644
index 0000000..3fa3592
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
@@ -0,0 +1,4 @@
+from falcon_unzip.phasing_readmap import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_quiver.py b/FALCON_unzip/src/py_scripts/fc_quiver.py
new file mode 100644
index 0000000..21a5af5
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_quiver.py
@@ -0,0 +1,4 @@
+from falcon_unzip.run_quiver import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py b/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
new file mode 100644
index 0000000..b68b43a
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
@@ -0,0 +1,4 @@
+from falcon_unzip.rr_hctg_track import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py b/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
new file mode 100644
index 0000000..0e85055
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
@@ -0,0 +1,4 @@
+from falcon_unzip.select_reads_from_bam import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py b/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
new file mode 100644
index 0000000..f84dc30
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
@@ -0,0 +1,398 @@
+#from pypeflow.common import *
+#from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
+#from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
+#from pypeflow.controller import PypeWorkflow, PypeMPWorkflow, PypeThreadWorkflow
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+ makePypeLocalFile, fn, PypeTask)
+PypeThreadTaskBase = MyFakePypeThreadTaskBase
+from falcon_kit.FastaReader import FastaReader
+from falcon_kit.fc_asm_graph import AsmGraph
+import glob
+import sys
+import subprocess as sp
+import shlex
+import os
+
+def make_dirs(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+rawread_dir = os.path.abspath( "./0-rawreads" )
+pread_dir = os.path.abspath( "./1-preads_ovl" )
+asm_dir = os.path.abspath( os.path.join("./3-unzip/") )
+
+read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
+make_dirs(read_map_dir)
+
+wf = PypeProcWatcherWorkflow(
+ max_jobs=12,
+)
+
+rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
+rawread_id_file = makePypeLocalFile( os.path.join( rawread_dir, "raw_read_ids" ) )
+
+ at PypeTask( inputs = {"rawread_db": rawread_db},
+ outputs = {"rawread_id_file": rawread_id_file},
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/dump_rawread_ids" )
+def dump_rawread_ids(self):
+ rawread_db = fn( self.rawread_db )
+ rawread_id_file = fn( self.rawread_id_file )
+ os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )
+
+wf.addTask( dump_rawread_ids )
+
+pread_db = makePypeLocalFile( os.path.join( pread_dir, "preads.db" ) )
+pread_id_file = makePypeLocalFile( os.path.join( pread_dir, "pread_ids" ) )
+
+ at PypeTask( inputs = {"pread_db": pread_db},
+ outputs = {"pread_id_file": pread_id_file},
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/dump_pread_ids" )
+def dump_pread_ids(self):
+ pread_db = fn( self.pread_db )
+ pread_id_file = fn( self.pread_id_file )
+ os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file) )
+
+wf.addTask( dump_pread_ids )
+wf.refreshTargets() # block
+
+all_raw_las_files = {}
+for las_fn in glob.glob( os.path.join( rawread_dir, "m*/raw_reads.*.las") ):
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ las_file = makePypeLocalFile( las_fn )
+ all_raw_las_files["r_las_%s" % idx] = las_file
+
+all_pread_las_files = {}
+for las_fn in glob.glob( os.path.join( pread_dir, "m*/preads.*.las") ):
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ las_file = makePypeLocalFile( las_fn )
+ all_pread_las_files["p_las_%s" % idx] = las_file
+
+
+
+h_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_edges") )
+p_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_p_ctg_edges") )
+h_ctg_ids = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_ids") )
+
+inputs = { "rawread_id_file": rawread_id_file,
+ "pread_id_file": pread_id_file,
+ "h_ctg_edges": h_ctg_edges,
+ "p_ctg_edges": p_ctg_edges,
+ "h_ctg_ids": h_ctg_ids}
+
+read_to_contig_map = makePypeLocalFile( os.path.join(read_map_dir, "read_to_contig_map") )
+
+@PypeTask( inputs = inputs,
+ outputs = {"read_to_contig_map": read_to_contig_map},
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/get_ctg_read_map" )
+
+def generate_read_to_ctg_map(self):
+ rawread_id_file = fn( self.rawread_id_file )
+ pread_id_file = fn( self.pread_id_file )
+ read_to_contig_map = fn( self.read_to_contig_map )
+
+ pread_did_to_rid = open(pread_id_file).read().split("\n")
+ rid_to_oid = open(rawread_id_file).read().split("\n")
+
+ h_ctg_edges = fn( self.h_ctg_edges )
+ p_ctg_edges = fn( self.p_ctg_edges )
+
+ h_ctg_ids = set()
+ with open(fn(self.h_ctg_ids)) as f:
+ for row in f:
+ row = row.strip()
+ h_ctg_ids.add( row )
+
+ pread_to_contigs = {}
+
+    for fname in ( p_ctg_edges, h_ctg_edges):
+        with open(fname) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg = row[0]
+ if len(ctg.split("_")) > 1 and ctg not in h_ctg_ids:
+ continue
+ n1 = row[1]
+ n2 = row[2]
+ pid1 = int(n1.split(":")[0])
+ pid2 = int(n2.split(":")[0])
+ rid1 = pread_did_to_rid[pid1].split("/")[1]
+ rid2 = pread_did_to_rid[pid2].split("/")[1]
+ rid1 = int(int(rid1)/10)
+ rid2 = int(int(rid2)/10)
+ oid1 = rid_to_oid[rid1]
+ oid2 = rid_to_oid[rid2]
+ k1 = (pid1, rid1, oid1)
+ pread_to_contigs.setdefault( k1, set() )
+ pread_to_contigs[ k1 ].add( ctg )
+ k2 = (pid2, rid2, oid2)
+ pread_to_contigs.setdefault( k2, set() )
+ pread_to_contigs[ k2 ].add( ctg )
+
+ with open(read_to_contig_map, "w") as f:
+ for k in pread_to_contigs:
+ pid, rid, oid = k
+ for ctg in list(pread_to_contigs[ k ]):
+ print >>f, "%09d %09d %s %s" % (pid, rid, oid, ctg)
+
+wf.addTask( generate_read_to_ctg_map )
+
+def dump_rawread_to_ctg(self):
+ rawread_db = fn(self.rawread_db)
+ rawread_id_file = fn(self.rawread_id_file)
+ phased_read_file = fn(self.phased_reads)
+ las_file = fn(self.las_file)
+ rawread_to_contig_file = fn(self.rawread_to_contig_file)
+ read_to_contig_map = fn(self.read_to_contig_map)
+ rid_to_oid = open(rawread_id_file).read().split('\n')
+
+
+ ovlp_data = []
+ ovlp_count = 0
+ longest_ovlp = 0
+ a_id = None
+ rid_to_contigs = {}
+
+ with open(read_to_contig_map) as f:
+ for row in f:
+ row = row.strip().split()
+ pid, rid, oid, ctg = row
+ rid = int(rid)
+ rid_to_contigs.setdefault( rid, (oid, set() ) )
+ rid_to_contigs[ rid ][1].add( ctg )
+
+ oid_to_phase = {}
+ with open(phased_read_file) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id, block, phase = row[1:4]
+ oid = row[6]
+ block = int(block)
+ phase = int(phase)
+ oid_to_phase[ oid ] = (ctg_id, block, phase)
+
+
+ with open(rawread_to_contig_file, "w") as f:
+ ovlp_data = {}
+ cur_read_id = None
+ skip_rest = 0
+ for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file)) ).splitlines():
+
+ row = row.strip().split()
+ t_id = int(row[1])
+ q_id = int(row[0])
+ if q_id != cur_read_id:
+ if cur_read_id == None:
+ cur_read_id = q_id
+ else:
+ if len(ovlp_data) == 0:
+ o_id = rid_to_oid[ cur_read_id ]
+ print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
+ if len(ovlp_data) != 0:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+ ovlp_data = {}
+ cur_read_id = q_id
+ skip_rest = 0
+
+ if q_id in rid_to_contigs and len(ovlp_data) == 0: #if the query is already an edge of some contig....
+ t_o_id, ctgs = rid_to_contigs[ q_id ]
+ o_id = rid_to_oid[ q_id ]
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
+ ovlp_data[ctg][0] = -int(row[7])
+ ovlp_data[ctg][1] += 1
+ skip_rest = 1
+
+ if skip_rest == 1:
+ continue
+
+ if t_id not in rid_to_contigs:
+ continue
+
+ q_phase = oid_to_phase.get( rid_to_oid[q_id], None )
+ if q_phase != None:
+ ctg_id, block, phase = q_phase
+ if block != -1:
+ t_phase = oid_to_phase.get( rid_to_oid[t_id], None )
+ if t_phase != None:
+ if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
+ continue
+
+ t_o_id, ctgs = rid_to_contigs[ t_id ]
+ o_id = rid_to_oid[ q_id ]
+
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
+ ovlp_data[ctg][0] += int(row[2])
+ ovlp_data[ctg][1] += 1
+
+ if len(ovlp_data) != 0:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+
+def dump_pread_to_ctg(self):
+ pread_db = fn( self.pread_db )
+ rawread_id_file = fn( self.rawread_id_file )
+ pread_id_file = fn( self.pread_id_file )
+ phased_read_file = fn( self.phased_reads)
+ read_to_contig_map = fn( self.read_to_contig_map )
+ las_file = fn( self.las_file )
+ pread_to_contig_file = fn( self.pread_to_contig_file )
+ read_to_contig_map = fn( self.read_to_contig_map )
+
+ pid_to_rid = open(pread_id_file).read().split("\n")
+ rid_to_oid = open(rawread_id_file).read().split("\n")
+
+
+ ovlp_data = []
+ ovlp_count = 0
+ longest_ovlp = 0
+ a_id = None
+ pid_to_contigs = {}
+
+ with open(read_to_contig_map) as f:
+ for row in f:
+ row = row.strip().split()
+ pid, rid, oid, ctg = row
+ pid = int(pid)
+ pid_to_contigs.setdefault( pid, (oid, set() ) )
+ pid_to_contigs[ pid ][1].add( ctg )
+
+ oid_to_phase = {}
+ with open(phased_read_file) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id, block, phase = row[1:4]
+ oid = row[6]
+ block = int(block)
+ phase = int(phase)
+ oid_to_phase[ oid ] = (ctg_id, block, phase)
+
+ with open(pread_to_contig_file, "w") as f:
+ ovlp_data = {}
+ cur_read_id = None
+ skip_rest = 0
+ for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file)) ).splitlines():
+
+ row = row.strip().split()
+ t_id = int(row[1])
+ q_id = int(row[0])
+ if q_id != cur_read_id:
+ if cur_read_id == None:
+ cur_read_id = q_id
+ else:
+ if len(ovlp_data) == 0:
+ rid = pid_to_rid[cur_read_id].split("/")[1]
+ rid = int(int(rid)/10)
+ o_id = rid_to_oid[ rid ]
+ print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
+ else:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+ ovlp_data = {}
+ cur_read_id = q_id
+ skip_rest = 0
+
+ if q_id in pid_to_contigs and len(ovlp_data) == 0: #if the query is in some contig....
+ t_o_id, ctgs = pid_to_contigs[ q_id ]
+ rid = pid_to_rid[q_id].split("/")[1]
+ rid = int(int(rid)/10)
+ o_id = rid_to_oid[ rid ]
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
+ ovlp_data[ctg][0] = -int(row[7])
+ ovlp_data[ctg][1] += 1
+ skip_rest = 1
+
+ if skip_rest == 1:
+ continue
+
+ if t_id not in pid_to_contigs:
+ continue
+
+ q_rid = int( int(pid_to_rid[q_id].split("/")[1])/10 )
+ q_phase = oid_to_phase.get( rid_to_oid[ q_rid ], None )
+
+ if q_phase != None:
+ ctg_id, block, phase = q_phase
+ if block != -1:
+ t_rid = int( int(pid_to_rid[t_id].split("/")[1])/10 )
+ t_phase = oid_to_phase.get( rid_to_oid[ t_rid ], None )
+ if t_phase != None:
+ if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
+ continue
+
+ t_o_id, ctgs = pid_to_contigs[ t_id ]
+ rid = pid_to_rid[q_id].split("/")[1]
+ rid = int(int(rid)/10)
+ o_id = rid_to_oid[ rid ]
+
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
+ ovlp_data[ctg][0] += int(row[2])
+ ovlp_data[ctg][1] += 1
+
+ if len(ovlp_data) != 0:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+
+
+phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads"))
+
+
+for las_key, las_file in all_raw_las_files.items():
+ las_fn = fn(las_file)
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
+ make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file,
+ "rawread_db": rawread_db,
+ "read_to_contig_map": read_to_contig_map,
+ "rawread_id_file": rawread_id_file,
+ "pread_id_file": pread_id_file,
+ "phased_reads" : phased_reads},
+ outputs = { "rawread_to_contig_file": rawread_to_contig_file },
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/r_read_to_contigs.%s" % idx )
+ dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)
+ wf.addTask( dump_rawread_to_ctg_task )
+
+for las_key, las_file in all_pread_las_files.items():
+ las_fn = fn(las_file)
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
+ make_dump_pread_to_ctg = PypeTask( inputs = { "las_file": las_file,
+ "pread_db": pread_db,
+ "read_to_contig_map": read_to_contig_map,
+ "rawread_id_file": rawread_id_file,
+ "pread_id_file": pread_id_file,
+ "phased_reads": phased_reads},
+ outputs = { "pread_to_contig_file": pread_to_contig_file },
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/pread_to_contigs.%s" % idx )
+ dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
+ wf.addTask( dump_pread_to_ctg_task )
+
+wf.refreshTargets() # block
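The id arithmetic repeated above (pid_to_rid[pid].split("/")[1] followed by an integer division by 10) recovers the raw-read index from a pread identifier. A worked sketch of that convention, where the sample name is a made-up illustration of the '<prefix>/<id>/<range>' form:

    # pid -> raw-read index, as done repeatedly above; the name is fabricated
    pread_name = 'pread/0001230/0_15000'
    rid = pread_name.split('/')[1]     # '0001230'
    rid = int(int(rid) / 10)           # 123, the raw-read index
    print rid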
diff --git a/FALCON_unzip/src/py_scripts/fc_unzip.py b/FALCON_unzip/src/py_scripts/fc_unzip.py
new file mode 100644
index 0000000..0e6bb71
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_unzip.py
@@ -0,0 +1,4 @@
+from falcon_unzip.unzip import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_utils/align.sh b/FALCON_unzip/src/py_utils/align.sh
new file mode 100644
index 0000000..2bcb508
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/align.sh
@@ -0,0 +1,10 @@
+~/bin/blasr_yli_0317 000000F_reads.fa 000000F_ref.fa -noSplitSubreads -clipping subread -placeRepeatsRandomly -minPctIdentity 70.0 -minMatch 12 -nproc 48 -bam -out aln.bam
+#~/bin/blasr_yli_0317 ../data/chm1_bam/all_merged.bam pa_ctg.fa -noSplitSubreads -clipping subread -placeRepeatsRandomly -minPctIdentity 70.0 -minMatch 12 -nproc 48 -bam -out test_chm1.bam
+#samtools sort -m 120G test_chm13.bam chm13_aln
+#samtools sort -m 120G test_chm1.bam chm1_aln
+#python screen_aln_chm1.py
+#python screen_aln_chm13.py
+#bamtools merge -in chm1_aln_sampled.bam -in chm13_aln_sampled.bam -out chm1_chm13_sampled_merged.bam
+#samtools sort -m 120G chm1_chm13_sampled_merged.bam chm1_chm13_sampled_merged_sorted
+#samtools index chm1_chm13_sampled_merged_sorted.bam
+#variantCaller.py --skipUnrecognizedContigs -v -j40 -r pa_ctg.fa --algorithm=quiver chm1_chm13_sampled_merged_sorted.bam -o cns.fastq.gz -o cns.fasta.gz
diff --git a/FALCON_unzip/src/py_utils/call_SV_from_mum.py b/FALCON_unzip/src/py_utils/call_SV_from_mum.py
new file mode 100644
index 0000000..ac8e441
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/call_SV_from_mum.py
@@ -0,0 +1,85 @@
+import sys
+
+mum_fn = sys.argv[1]
+
+def process_mum_data( pread_id, mum_data ):
+ delta = 0
+ pend = 0
+ qend = 0
+ p_delta = 0
+
+
+ rtn = []
+ for rpos, qpos, len_ in mum_data:
+ rpos = int(rpos)
+ qpos = int(qpos)
+ len_ = int(len_)
+ offset = rpos - qpos
+
+ if pend == 0:
+ rtn.append( ( rpos, qpos, len_, offset, rpos+len_, 0, 0, 0, pread_id) )
+ else:
+ rtn.append( ( rpos, qpos, len_, offset, rpos+len_, offset-p_offset, rpos-pend, qpos-qend, pread_id) )
+ pend = rpos+len_
+ qend = qpos+len_
+ p_offset = offset
+
+    # break the matches into clusters
+ cluster = []
+ p_delta = 0
+ for d in rtn:
+ rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
+ if q_delta < -10:
+ continue
+ if r_delta < -10:
+ continue
+ if len(cluster) == 0: #start a new cluster
+ cluster.append([])
+ cluster[-1].append( d )
+ #p_delta = delta
+ continue
+ if delta > 30000:
+ #print delta, p_delta
+ #p_delta = 0
+ cluster.append([])
+ continue
+ cluster[-1].append( d )
+ #p_delta = delta
+ rtn = []
+ for cls_id in xrange(len(cluster)):
+ aln_size = 0
+ qpos_ary = []
+ if len(cluster[cls_id]) == 0:
+ continue
+ for d in cluster[cls_id]:
+ rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
+ aln_size += len_
+ qpos_ary.append(qpos)
+ if 1.0*(max(qpos_ary) - min(qpos_ary))/aln_size < 0.5 or aln_size < 2000:
+ continue
+
+ for d in cluster[cls_id]:
+ rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
+ rtn.append( (rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id) )
+
+
+ return rtn
+
+
+
+mum_data = []
+with open(mum_fn) as mum_file:
+ for row in mum_file:
+ row = row.strip().split()
+ if row[0] == ">":
+ if len(mum_data) > 0:
+ rtn = process_mum_data( pread_id, mum_data )
+ if len(rtn) > 0:
+ print ">",pread_id
+ for d in rtn:
+ print " ".join([str(c) for c in d])
+ pread_id = row[1]
+ mum_data = []
+ #print
+        else:
+            mum_data.append( row )
+
+# process the final pread's matches once the input is exhausted
+if len(mum_data) > 0:
+    rtn = process_mum_data( pread_id, mum_data )
+    if len(rtn) > 0:
+        print ">",pread_id
+        for d in rtn:
+            print " ".join([str(c) for c in d])
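The parser above expects MUM-style input in which each query begins with a '> <pread_id>' header line followed by 'ref_pos query_pos match_len' rows; a tiny sketch of parsing such a fragment, with invented coordinates:

    # fabricated fragment in the format the script above consumes
    example = '''\
    > pread_000001
    1000 10 5000
    6200 5030 2500
    '''
    for line in example.splitlines():
        row = line.strip().split()
        if not row:
            continue
        if row[0] == '>':
            print 'query:', row[1]
        else:
            rpos, qpos, len_ = [int(c) for c in row]
            print rpos, qpos, len_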
diff --git a/FALCON_unzip/src/py_utils/fetch_read.py b/FALCON_unzip/src/py_utils/fetch_read.py
new file mode 100644
index 0000000..74def7e
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/fetch_read.py
@@ -0,0 +1,12 @@
+from falcon_kit.FastaReader import FastaReader
+
+import sys
+
+f = FastaReader(sys.argv[1])
+rl = set(open(sys.argv[2]).read().split())
+for r in f:
+ rid = r.name.split()[0]
+ if rid not in rl:
+ continue
+ print ">"+rid
+ print r.sequence
diff --git a/FALCON_unzip/src/py_utils/split_bam.py b/FALCON_unzip/src/py_utils/split_bam.py
new file mode 100644
index 0000000..f5562f0
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/split_bam.py
@@ -0,0 +1,28 @@
+import subprocess, shlex
+
+
+read_to_phase = {}
+with open("phased_reads") as f:
+ for l in f:
+ l = l.strip().split()
+ rid = l[-1]
+ phase = l[1] + "_" +l[2]
+ read_to_phase[rid] = phase
+
+p = subprocess.Popen(shlex.split("samtools view -h 000207F.000207F000207F.bam" ), stdout=subprocess.PIPE)
+
+phase_to_file = {}
+header = []
+for l in p.stdout:
+ l2 = l.strip().split()
+ if l2[0][0] == "@":
+ header.append(l.strip())
+ QNAME = l2[0]
+ phase = read_to_phase.get(QNAME, None)
+ if phase != None:
+ pfn = ("./phased_sam/%s" % phase) +".sam"
+ if pfn not in phase_to_file:
+ phase_to_file[pfn] = open(pfn,"w")
+ print >> phase_to_file[pfn], "\n".join(header)
+ print >> phase_to_file[pfn], l.strip()
+
diff --git a/pypeFLOW/pwatcher/fs_based.py b/pypeFLOW/pwatcher/fs_based.py
index 0ace6c7..e22639f 100755
--- a/pypeFLOW/pwatcher/fs_based.py
+++ b/pypeFLOW/pwatcher/fs_based.py
@@ -404,7 +404,7 @@ class MetaJobSlurm(object):
sge_option = qstripped(self.mjob.job.options['sge_option'])
job_queue = self.mjob.job.options['job_queue']
cwd = os.getcwd()
- sge_cmd = 'sbatch -J {job_name} -q {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
+ sge_cmd = 'sbatch -J {job_name} -p {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
**locals())
# "By default all environment variables are propagated."
# http://slurm.schedmd.com/sbatch.html
diff --git a/pypeFLOW/pwatcher/network_based.py b/pypeFLOW/pwatcher/network_based.py
index 8866b54..126c773 100755
--- a/pypeFLOW/pwatcher/network_based.py
+++ b/pypeFLOW/pwatcher/network_based.py
@@ -625,7 +625,7 @@ class MetaJobSlurm(object):
sge_option = qstripped(self.mjob.job.options['sge_option'])
job_queue = self.mjob.job.options['job_queue']
cwd = os.getcwd()
- sge_cmd = 'sbatch -J {job_name} -q {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
+ sge_cmd = 'sbatch -J {job_name} -p {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
**locals())
# "By default all environment variables are propagated."
# http://slurm.schedmd.com/sbatch.html
diff --git a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
index 3b2fe59..5c068be 100644
--- a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
+++ b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
@@ -13,6 +13,7 @@ import pprint
import random
import sys
import time
+import traceback
LOG = logging.getLogger(__name__)
@@ -463,13 +464,13 @@ def only_path(p):
return p.path
else:
return p
-def PypeTask(inputs, outputs, TaskType=None, parameters=None, URL=None, wdir=None, name=None):
+def PypeTask(inputs, outputs, parameters=None, wdir=None, name=None):
"""A slightly messy factory because we want to support both strings and PypeLocalFiles, for now.
This can alter dict values in inputs/outputs if they were not already PypeLocalFiles.
"""
if wdir is None:
wdir = find_work_dir([only_path(v) for v in outputs.values()])
- this = _PypeTask(inputs, outputs, parameters, URL, wdir, name)
+ this = _PypeTask(inputs, outputs, parameters, wdir, name)
#basedir = os.path.basename(wdir)
basedir = this.name
if basedir in PRODUCERS:
@@ -507,15 +508,14 @@ class _PypeTask(object):
return self
def __repr__(self):
return 'PypeTask({!r}, {!r}, {!r}, {!r})'.format(self.name, self.wdir, pprint.pformat(self.outputs), pprint.pformat(self.inputs))
- def __init__(self, inputs, outputs, parameters=None, URL=None, wdir=None, name=None):
+ def __init__(self, inputs, outputs, parameters=None, wdir=None, name=None):
if parameters is None:
parameters = {}
if wdir is None:
wdir = parameters.get('wdir', name) # One of these must be a string!
if name is None:
name = os.path.relpath(wdir)
- if URL is None:
- URL = 'task://localhost/{}'.format(name)
+ URL = 'task://localhost/{}'.format(name)
self.inputs = inputs
self.outputs = outputs
self.parameters = parameters
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/falcon.git