[med-svn] [falcon] 01/04: Imported Upstream version 1.8.6
Afif Elghraoui
afif at moszumanska.debian.org
Wed Dec 28 08:13:30 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository falcon.
commit 215b42650780bb288e2f39d96ffe4b2848224609
Author: Afif Elghraoui <afif at debian.org>
Date: Tue Dec 27 23:41:51 2016 -0800
Imported Upstream version 1.8.6
---
FALCON/falcon_kit/bash.py | 36 +-
FALCON/falcon_kit/functional.py | 14 +-
FALCON/falcon_kit/mains/LAmerge.py | 51 +
FALCON/falcon_kit/mains/LAsort.py | 46 +
FALCON/falcon_kit/mains/run1.py | 14 +-
FALCON/falcon_kit/pype_tasks.py | 8 +-
FALCON/test/test_functional.py | 8 +
FALCON_unzip/LICENSE.txt | 32 -
FALCON_unzip/README.md | 1 -
FALCON_unzip/examples/fc_unzip.cfg | 17 -
FALCON_unzip/examples/unzip.sh | 2 -
FALCON_unzip/examples/unzip_obsoleted.sh | 33 -
FALCON_unzip/falcon_unzip/__init__.py | 36 -
FALCON_unzip/falcon_unzip/dedup_h_tigs.py | 70 -
FALCON_unzip/falcon_unzip/get_read_hctg_map.py | 110 --
FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py | 678 ---------
.../falcon_unzip/ovlp_filter_with_phase.py | 354 -----
FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py | 1545 --------------------
FALCON_unzip/falcon_unzip/phasing.py | 570 --------
FALCON_unzip/falcon_unzip/phasing_readmap.py | 73 -
FALCON_unzip/falcon_unzip/rr_hctg_track.py | 201 ---
FALCON_unzip/falcon_unzip/run_quiver.py | 394 -----
FALCON_unzip/falcon_unzip/select_reads_from_bam.py | 113 --
FALCON_unzip/falcon_unzip/unzip.py | 357 -----
FALCON_unzip/setup.py | 24 -
FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py | 4 -
.../src/py_scripts/fc_get_read_hctg_map.py | 4 -
FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py | 4 -
.../src/py_scripts/fc_ovlp_filter_with_phase.py | 6 -
.../src/py_scripts/fc_phased_ovlp_to_graph.py | 5 -
FALCON_unzip/src/py_scripts/fc_phasing.py | 5 -
FALCON_unzip/src/py_scripts/fc_phasing_readmap.py | 4 -
FALCON_unzip/src/py_scripts/fc_quiver.py | 4 -
FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py | 4 -
.../src/py_scripts/fc_select_reads_from_bam.py | 4 -
.../src/py_scripts/fc_track_reads_htigs0.py | 398 -----
FALCON_unzip/src/py_scripts/fc_unzip.py | 4 -
FALCON_unzip/src/py_utils/align.sh | 10 -
FALCON_unzip/src/py_utils/call_SV_from_mum.py | 85 --
FALCON_unzip/src/py_utils/fetch_read.py | 12 -
FALCON_unzip/src/py_utils/split_bam.py | 28 -
pypeFLOW/pwatcher/fs_based.py | 30 +-
pypeFLOW/pypeflow/do_task.py | 4 +
pypeFLOW/pypeflow/simple_pwatcher_bridge.py | 59 +-
44 files changed, 228 insertions(+), 5233 deletions(-)
diff --git a/FALCON/falcon_kit/bash.py b/FALCON/falcon_kit/bash.py
index 1f64a96..e0880e8 100644
--- a/FALCON/falcon_kit/bash.py
+++ b/FALCON/falcon_kit/bash.py
@@ -276,17 +276,43 @@ def scripts_merge(config, db_prefix, run_jobs_fn):
"""
with open(run_jobs_fn) as f:
mjob_data = functional.get_mjob_data(f)
+ bash_funcs = """
+# Get the real path of a link, following all links (since 'readlink -f' is not on OSX).
+# http://stackoverflow.com/questions/7665/how-to-resolve-symbolic-links-in-a-shell-script/697552
+realpath() {
+ local r=$1; local t=$(readlink $r)
+ while [ $t ]; do
+ r=$(cd $(dirname $r) && cd $(dirname $t) && pwd -P)/$(basename $t)
+ t=$(readlink $r)
+ done
+ echo $r
+}
+rmfollow() {
+ local path=$(realpath $1)
+ rm -f "${path}"
+}
+"""
#las_fns = functional.get_las_filenames(mjob_data, db_prefix) # ok, but tricky
for p_id in mjob_data:
+ #las_fn = las_fns[p_id]
+ # We already know the output .las filename by convention.
+ las_fn = '%s.%s.las' % (db_prefix, p_id)
+
bash_lines = mjob_data[p_id]
script = []
for line in bash_lines:
- script.append(line.replace('&&', ';'))
- #las_fn = las_fns[p_id]
- # We already know the output .las filename by convention.
- las_fn = '%s.%s.las' % (db_prefix, p_id)
- yield p_id, '\n'.join(script + ['']), las_fn
+ script.append(line) # Assume we no longer have "&& rm".
+ for line in bash_lines:
+ if not line.startswith('LAmerge'):
+ continue
+ las_files = [word + '.las' for word in functional.yield_args_from_line(line)]
+ assert las_fn == os.path.basename(las_files[0])
+ script.extend('rmfollow {}'.format(fn) for fn in las_files[1:])
+ break
+
+ content = bash_funcs + '\n'.join(script + [''])
+ yield p_id, content, las_fn
def script_run_consensus(config, db_fn, las_fn, out_file_bfn):
"""config: falcon_sense_option, length_cutoff
diff --git a/FALCON/falcon_kit/functional.py b/FALCON/falcon_kit/functional.py
index a799b4d..b9495d5 100644
--- a/FALCON/falcon_kit/functional.py
+++ b/FALCON/falcon_kit/functional.py
@@ -33,7 +33,10 @@ def get_daligner_job_descriptions_sans_LAcheck(run_jobs_stream, db_prefix, singl
descs = get_daligner_job_descriptions(run_jobs_stream, db_prefix, single)
result = {}
for k,v in descs.iteritems():
- result[k] = skip_LAcheck(v)
+ bash = skip_LAcheck(v)
+ bash = bash.replace('LAsort', 'python -m falcon_kit.mains.LAsort {}'.format(db_prefix))
+ bash = bash.replace('LAmerge', 'python -m falcon_kit.mains.LAmerge {}'.format(db_prefix))
+ result[k] = bash
return result
def get_daligner_job_descriptions(run_jobs_stream, db_prefix, single=False):
@@ -208,6 +211,15 @@ def get_mjob_data(run_jobs_stream):
mjob_data[p_id].append(l)
return mjob_data
+def yield_args_from_line(bash_line):
+ """Given a line of LAmerge, etc.,
+ return [output_las_fn, input_las_fn0, input_las_fn1, ...]
+ """
+ for word in bash_line.split():
+ if word.startswith('-') or word in ('LAcheck', 'LAmerge', 'LAsort'):
+ continue
+ yield word
+
_re_sub_daligner = re.compile(r'^daligner\b', re.MULTILINE)
def xform_script_for_preads(script):
daligner_exe = 'daligner_p'
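For reference, the new yield_args_from_line() helper simply drops flags and the LA* program name from a command line, leaving the positional .las arguments (output first, then inputs). Using the command line from the new test added below:

    from falcon_kit import functional

    line = 'LAmerge -v preads.1 preads.1.preads.1.C0.S preads.1.preads.1.N0.S'
    assert list(functional.yield_args_from_line(line)) == [
        'preads.1', 'preads.1.preads.1.C0.S', 'preads.1.preads.1.N0.S']
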
diff --git a/FALCON/falcon_kit/mains/LAmerge.py b/FALCON/falcon_kit/mains/LAmerge.py
new file mode 100644
index 0000000..762ebc0
--- /dev/null
+++ b/FALCON/falcon_kit/mains/LAmerge.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+"""Usage:
+
+ LAmerge.py DB <args>
+
+Run LAcheck on each input in args. Exclude any failures from
+the arglist. Then run LAmerge on the remaining arglist.
+
+This differs from LAsort.py in that the first las file is actually
+an *explicit* output, whereas LAsort relies on *implicit* outputs.
+"""
+import sys, os
+
+def log(msg):
+ sys.stderr.write(msg + '\n')
+
+def system(call, checked=False):
+ log('!{}'.format(call))
+ rc = os.system(call)
+ if rc:
+ msg = '{} <- {!r}'.format(rc, call)
+ if checked:
+ raise Exception(msg)
+ log(msg)
+ return rc
+
+def main(argv=sys.argv):
+ db = argv[1]
+ args = argv[2:] # Skip program name
+ lass = list()
+ new_args = list()
+ new_args.append('LAmerge')
+ for arg in args:
+ if arg.startswith('-'):
+ new_args.append(arg)
+ else:
+ lass.append(arg)
+ outlas = lass[0]
+ new_args.append(outlas) # This is the output las.
+ for las in lass[1:]:
+ rc = system('LAcheck -vS {} {}.las'.format(db, las)) # Assume sorted.
+ if rc:
+ log('Skipping {}.las'.format(las))
+ else:
+ new_args.append(las)
+ system(' '.join(new_args))
+ system('LAcheck -vS {} {}.las'.format(db, outlas)) # Assume sorted.
+
+
+if __name__ == "__main__":
+ main()
diff --git a/FALCON/falcon_kit/mains/LAsort.py b/FALCON/falcon_kit/mains/LAsort.py
new file mode 100644
index 0000000..387bb47
--- /dev/null
+++ b/FALCON/falcon_kit/mains/LAsort.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+"""Usage:
+
+ LAsort.py DB <args>
+
+Run LAcheck on each input in args. Exclude any failures from
+the arglist. Then run LAsort on the remaining arglist.
+"""
+import sys, os
+
+def log(msg):
+ sys.stderr.write(msg + '\n')
+
+def system(call, checked=False):
+ log('!{}'.format(call))
+ rc = os.system(call)
+ if rc:
+ msg = '{} <- {!r}'.format(rc, call)
+ if checked:
+ raise Exception(msg)
+ log(msg)
+ return rc
+
+def main(argv=sys.argv):
+ log('argv:{!r}'.format(argv))
+ db = argv[1]
+ args = argv[2:] # Skip program name
+ lass = list()
+ new_args = list()
+ new_args.append('LAsort')
+ for arg in args:
+ if arg.startswith('-'):
+ new_args.append(arg)
+ else:
+ lass.append(arg)
+ for las in lass:
+ rc = system('LAcheck -v {} {}.las'.format(db, las))
+ if rc:
+ log('Skipping {}.las'.format(las))
+ else:
+ new_args.append(las)
+ system(' '.join(new_args))
+
+
+if __name__ == "__main__":
+ main()
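The two wrappers above are what the LAsort/LAmerge substitution in functional.py now invokes (e.g. "python -m falcon_kit.mains.LAmerge DB ..."); they LAcheck each input .las and drop (and log) any that fail before running the real tool. A hedged in-process illustration with a hypothetical database prefix 'preads' (the LAcheck/LAmerge binaries must be on PATH for this to do anything useful):

    from falcon_kit.mains import LAmerge

    # argv[0] is ignored, argv[1] is the DB prefix, and the first
    # non-flag argument is the output .las; the rest are inputs.
    LAmerge.main(['prog', 'preads', '-v',
                  'preads.1', 'preads.1.preads.1.C0.S', 'preads.1.preads.1.N0.S'])
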
diff --git a/FALCON/falcon_kit/mains/run1.py b/FALCON/falcon_kit/mains/run1.py
index 2e5f99d..9b7bee6 100644
--- a/FALCON/falcon_kit/mains/run1.py
+++ b/FALCON/falcon_kit/mains/run1.py
@@ -121,12 +121,15 @@ def main1(prog_name, input_config_fn, logger_config_fn=None):
fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
raise
input_fofn_plf = makePypeLocalFile(config['input_fofn'])
+ genome_size = config.get('genome_size')
+ squash = True if 0 < genome_size < 1000000 else False
wf = PypeProcWatcherWorkflow(job_type=config['job_type'],
job_queue=config['job_queue'],
sge_option=config.get('sge_option', ''),
watcher_type=config['pwatcher_type'],
watcher_directory=config['pwatcher_directory'],
use_tmpdir=config.get('use_tmpdir'),
+ squash=squash
)
run(wf, config,
os.path.abspath(input_config_fn),
@@ -259,6 +262,9 @@ def run(wf, config,
sys.exit(0)
# Produce new FOFN of preads fasta, based on consensus of overlaps.
+ concurrent_jobs = config['cns_concurrent_jobs']
+ wf.max_jobs = concurrent_jobs
+
scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json')
make_task = PypeTask(
inputs = {
@@ -298,8 +304,6 @@ def run(wf, config,
task = make_task(pype_tasks.task_report_pre_assembly)
wf.addTask(task)
- concurrent_jobs = config['cns_concurrent_jobs']
- wf.max_jobs = concurrent_jobs
wf.refreshTargets(exitOnFailure=exitOnFailure)
@@ -341,6 +345,9 @@ def run(wf, config,
preads_nblock = support.get_nblock(fn(preads_db))
#### run daligner
+ concurrent_jobs = config['ovlp_concurrent_jobs']
+ wf.max_jobs = concurrent_jobs
+
config['sge_option_da'] = config['sge_option_pda']
scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json')
@@ -404,9 +411,6 @@ def run(wf, config,
task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
wf.addTask(task)
- concurrent_jobs = config['ovlp_concurrent_jobs']
- wf.max_jobs = concurrent_jobs
-
wf.refreshTargets(exitOnFailure=exitOnFailure)
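The run1.py changes set wf.max_jobs before the corresponding tasks are added (rather than after) and derive a new 'squash' flag from genome_size. A small sketch of the flag's effect, with illustrative sizes; only the 1 Mb threshold is taken from the code above:

    def want_squash(genome_size):
        # Mirrors the expression in main1(): squash only small genomes (< 1 Mb).
        return True if 0 < genome_size < 1000000 else False

    assert want_squash(500000) is True       # small test dataset
    assert want_squash(3000000000) is False  # human-sized genome
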
diff --git a/FALCON/falcon_kit/pype_tasks.py b/FALCON/falcon_kit/pype_tasks.py
index 169bf5e..4219388 100644
--- a/FALCON/falcon_kit/pype_tasks.py
+++ b/FALCON/falcon_kit/pype_tasks.py
@@ -195,7 +195,11 @@ def task_run_las_merge(self):
gathered_dict = read_gathered_las(gathered_las_fn)
las_paths = gathered_dict[job_id]
for las_path in las_paths:
- src = os.path.relpath(las_path, cwd)
+ assert os.path.isabs(las_path)
+ if os.path.commonprefix([las_path, cwd]) == '/':
+ src = las_path
+ else:
+ src = os.path.relpath(las_path, cwd)
tgt = os.path.join(cwd, os.path.basename(las_path))
LOG.debug('symlink {!r} -> {!r}'.format(src, tgt))
os.symlink(src, tgt)
@@ -252,7 +256,7 @@ def task_daligner_scatter(self):
LOG.info('Skip LAcheck after daligner? {}'.format(skip_checks))
func = task_run_daligner
func_name = '{}.{}'.format(func.__module__, func.__name__)
- for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, db_build_done, nblock, pread_aln, skip_checks):
+ for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, db_build_done, nblock, pread_aln, skip_check=skip_checks):
job_done_fn = 'job_%s_done' %job_uid
parameters = {'daligner_script': script,
'job_uid': job_uid,
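The commonprefix guard in task_run_las_merge() above keeps the symlink source absolute whenever the .las path and the merge working directory share nothing but the filesystem root, and relative otherwise. An illustrative sketch of the same decision with hypothetical paths:

    import os

    def link_source(las_path, cwd):
        # Same logic as task_run_las_merge(): absolute source when the only
        # common prefix is '/', otherwise a path relative to the working directory.
        assert os.path.isabs(las_path)
        if os.path.commonprefix([las_path, cwd]) == '/':
            return las_path
        return os.path.relpath(las_path, cwd)

    assert link_source('/scratch/las/raw_reads.1.las', '/home/user/run/m_00001') == '/scratch/las/raw_reads.1.las'
    assert link_source('/home/user/run/las/raw_reads.1.las', '/home/user/run/m_00001') == '../las/raw_reads.1.las'
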
diff --git a/FALCON/test/test_functional.py b/FALCON/test/test_functional.py
index aafb6d4..bbcacaf 100644
--- a/FALCON/test/test_functional.py
+++ b/FALCON/test/test_functional.py
@@ -200,3 +200,11 @@ sample_perl_counts_output = """
def test_calc_metric_fragmentation():
frag = f.calc_metric_fragmentation(sample_perl_counts_output)
eq_(frag, 2.5)
+
+def test_args_from_line():
+ line = 'LAmerge -v preads.1 preads.1.preads.1.C0.S preads.1.preads.1.N0.S'
+ expected = ['preads.1', 'preads.1.preads.1.C0.S', 'preads.1.preads.1.N0.S']
+ result = list(f.yield_args_from_line(line))
+ helpers.equal_list(result, expected)
+ bash_lines = [line]
+ las_files = [word + '.las' for word in f.yield_args_from_line(line) for line in bash_lines if line.startswith('LAmerge')][1:]
diff --git a/FALCON_unzip/LICENSE.txt b/FALCON_unzip/LICENSE.txt
deleted file mode 100644
index 191f795..0000000
--- a/FALCON_unzip/LICENSE.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-The Clear BSD License
-
-Copyright (c) 2016, Pacific Biosciences
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted (subject to the limitations in the disclaimer
-below) provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its contributors may be used
- to endorse or promote products derived from this software without specific
- prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS
-LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
-GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGE.
diff --git a/FALCON_unzip/README.md b/FALCON_unzip/README.md
deleted file mode 100644
index fd5191c..0000000
--- a/FALCON_unzip/README.md
+++ /dev/null
@@ -1 +0,0 @@
-Please refer to https://github.com/PacificBiosciences/FALCON_unzip/wiki
diff --git a/FALCON_unzip/examples/fc_unzip.cfg b/FALCON_unzip/examples/fc_unzip.cfg
deleted file mode 100644
index 787360c..0000000
--- a/FALCON_unzip/examples/fc_unzip.cfg
+++ /dev/null
@@ -1,17 +0,0 @@
-[General]
-job_type = SGE
-
-[Unzip]
-input_fofn= input.fofn
-input_bam_fofn= input_bam.fofn
-
-smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
-
-jobqueue = your_sge_queue
-sge_phasing= -pe smp 12 -q %(jobqueue)s
-sge_quiver= -pe smp 12 -q %(jobqueue)s
-sge_track_reads= -pe smp 12 -q %(jobqueue)s
-sge_blasr_aln= -pe smp 24 -q %(jobqueue)s
-sge_hasm= -pe smp 48 -q %(jobqueue)s
-unzip_concurrent_jobs = 64
-quiver_concurrent_jobs = 64
diff --git a/FALCON_unzip/examples/unzip.sh b/FALCON_unzip/examples/unzip.sh
deleted file mode 100644
index 529e368..0000000
--- a/FALCON_unzip/examples/unzip.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-fc_unzip.py fc_unzip.cfg
-fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/examples/unzip_obsoleted.sh b/FALCON_unzip/examples/unzip_obsoleted.sh
deleted file mode 100644
index 88d22da..0000000
--- a/FALCON_unzip/examples/unzip_obsoleted.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# identify raw reads to each contig
-fc_track_reads.py
-# get the reads assigned to each contig from the initial *.subreads.fasta files
-mkdir -p 3-unzip/reads/
-fc_fetch_reads.py
-# phasing the read
-fc_unzip.py fc_unzip.cfg
-# doing the haplotig assembly, this part will be intergrated into fc_unzip.py
-mkdir -p 3-unzip/1-hasm/
-cd 3-unzip/1-hasm/
-find ../0-phasing/ -name "rid_to_phase.*" | xargs cat > rid_to_phase.all
-fc_ovlp_filter_with_phase.py --fofn ../../2-asm-falcon/las.fofn --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map ./rid_to_phase.all > preads.p_ovl
-fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
-fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map ./rid_to_phase.all --fasta ../../1-preads_ovl/preads4falcon.fasta
-WD=$PWD
-for f in `cat ../reads/ctg_list `;do cd $WD/$f; fc_dedup_h_tigs.py $f; done
-# prepare for quviering the haplotig
-cd $WD/..
-rm all_phased_reads all_h_ctg_ids all_h_ctg_edges all_p_ctg_edges all_p_ctg.fa all_h_ctg.fa
-find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
-find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
-find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
-find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
-find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
-find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
-cd ../
-# identify raw reads to each primary contig or haplotig
-fc_track_reads_htigs.py
-# decompose PacBio raw reads with pulse information in bam files into individual bam file for each haplotig or primary contig
-mkdir -p 4-quiver/reads/
-fc_select_reads_from_bam.py input_bam.fofn
-# run blasr and quiver to generate consensus for each haplotig or primary contig
-fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/falcon_unzip/__init__.py b/FALCON_unzip/falcon_unzip/__init__.py
deleted file mode 100644
index 94e4fd5..0000000
--- a/FALCON_unzip/falcon_unzip/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#################################################################################$$
-# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
diff --git a/FALCON_unzip/falcon_unzip/dedup_h_tigs.py b/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
deleted file mode 100644
index 61bd8c3..0000000
--- a/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import os
-import sys
-from falcon_kit.FastaReader import FastaReader
-
-def main(args):
-
- ctg_id = sys.argv[1]
-
- if os.path.exists("h_ctg_all.{ctg_id}.fa".format(ctg_id = ctg_id)):
- os.system("nucmer -mum p_ctg.{ctg_id}.fa h_ctg_all.{ctg_id}.fa -p hp_aln".format(ctg_id = ctg_id))
- os.system("show-coords -T -H -l -c hp_aln.delta > hp_aln.coor")
- else:
- sys.exit(0) #it is ok if there is no h_ctg_all.{ctg_id}.fa, don't want to interupt the workflow
-
-
- if os.path.exists("hp_aln.coor"):
- filter_out = set()
- with open("hp_aln.coor") as f:
- for row in f:
- row = row.strip().split()
- q_cov = float(row[10])
- idt = float(row[6])
- if q_cov > 99 and idt > 99:
- filter_out.add(row[-1])
-
- p_ctg_to_phase = {}
- with open("p_ctg_path.%s" % ctg_id) as f:
- for row in f:
- row = row.strip().split()
- b_id, ph_id = (int(row[-2]), int(row[-1]) )
- p_ctg_to_phase.setdefault( row[0], {} )
- p_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
- p_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
-
-
- h_ctg_to_phase = {}
- with open("h_ctg_path.%s" % ctg_id) as f:
- for row in f:
- row = row.strip().split()
- b_id, ph_id = (int(row[-2]), int(row[-1]) )
- h_ctg_to_phase.setdefault( row[0], {} )
- h_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
- h_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
-
- h_ids = open("h_ctg_ids.%s" % ctg_id,"w")
- with open("h_ctg.%s.fa" % ctg_id, "w") as f:
- h_tig_all = FastaReader("h_ctg_all.%s.fa" % ctg_id)
- for r in h_tig_all:
- p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {})
-
- if len(r.sequence) < 500:
- continue
-
- if r.name in filter_out:
- edge_count = sum(h_ctg_to_phase[ r.name ].values())
- same_phase_to_p_ctg_count = 0
- for b_id, ph_id in h_ctg_to_phase[ r.name ]:
- if b_id != -1:
- if (b_id, ph_id) in p_ctg_phase:
- same_phase_to_p_ctg_count += h_ctg_to_phase[ r.name ][ (b_id, ph_id) ]
- unphased_edge_count = h_ctg_to_phase[ r.name ] .get( (-1, 0), 0 )
-
- print r.name, edge_count, unphased_edge_count, same_phase_to_p_ctg_count
- if edge_count - unphased_edge_count - same_phase_to_p_ctg_count < 5: # there are many non-p_ctg phase segment, do not filter out
- continue
-
- print >>f, ">"+r.name
- print >>f, r.sequence
- print >> h_ids, r.name
- h_ids.close()
diff --git a/FALCON_unzip/falcon_unzip/get_read_hctg_map.py b/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
deleted file mode 100644
index 18909f2..0000000
--- a/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
- makePypeLocalFile, fn, PypeTask)
-import argparse
-import logging
-import os
-import sys
-
-def make_dirs(d):
- if not os.path.isdir(d):
- os.makedirs(d)
-
-def generate_read_to_hctg_map(self):
- rawread_id_file = fn( self.rawread_id_file )
- pread_id_file = fn( self.pread_id_file )
- read_to_contig_map = fn( self.read_to_contig_map )
-
- pread_did_to_rid = open(pread_id_file).read().split('\n')
- rid_to_oid = open(rawread_id_file).read().split('\n')
-
- h_ctg_edges = fn( self.h_ctg_edges )
- p_ctg_edges = fn( self.p_ctg_edges )
-
- h_ctg_ids = set()
- with open(fn(self.h_ctg_ids)) as f:
- for row in f:
- row = row.strip()
- h_ctg_ids.add( row )
-
- pread_to_contigs = {}
-
- for fnanme in ( p_ctg_edges, h_ctg_edges):
- with open(fnanme) as f:
- for row in f:
- row = row.strip().split()
- ctg = row[0]
- if len(ctg.split('_')) > 1 and ctg not in h_ctg_ids:
- continue
- n1 = row[1]
- n2 = row[2]
- pid1 = int(n1.split(':')[0])
- pid2 = int(n2.split(':')[0])
- rid1 = pread_did_to_rid[pid1].split('/')[1]
- rid2 = pread_did_to_rid[pid2].split('/')[1]
- rid1 = int(int(rid1)/10)
- rid2 = int(int(rid2)/10)
- oid1 = rid_to_oid[rid1]
- oid2 = rid_to_oid[rid2]
- k1 = (pid1, rid1, oid1)
- pread_to_contigs.setdefault( k1, set() )
- pread_to_contigs[ k1 ].add( ctg )
- k2 = (pid2, rid2, oid2)
- pread_to_contigs.setdefault( k2, set() )
- pread_to_contigs[ k2 ].add( ctg )
-
- with open(read_to_contig_map, 'w') as f:
- for k in pread_to_contigs:
- pid, rid, oid = k
- for ctg in list(pread_to_contigs[ k ]):
- print >>f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
-
-def get_read_hctg_map(asm_dir, hasm_dir, quiver_dir):
- wf = PypeProcWatcherWorkflow(
- max_jobs=12, # TODO: Why was NumThreads ever set? There is only one task!
- )
-
- rawread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
- pread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
- h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
- p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
- h_ctg_ids = makePypeLocalFile( os.path.join(hasm_dir, "all_h_ctg_ids"))
- read_map_dir = os.path.join(quiver_dir, 'read_maps')
- make_dirs(read_map_dir)
-
- read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, 'read_to_contig_map'))
-
- inputs = { 'rawread_id_file': rawread_id_file,
- 'pread_id_file': pread_id_file,
- 'h_ctg_edges': h_ctg_edges,
- 'p_ctg_edges': p_ctg_edges,
- 'h_ctg_ids': h_ctg_ids}
-
- make_task = PypeTask(
- inputs = inputs,
- outputs = {'read_to_contig_map': read_to_contig_map},
- )
- wf.addTask(make_task(generate_read_to_hctg_map))
- wf.refreshTargets() # block
-
-
-
-def parse_args(argv):
- parser = argparse.ArgumentParser(description='generate `4-quiver/read_maps/read_to_contig_map` that contains the \
-information from the chain of mapping: (contig id, last col) -> (internal p-read id) -> (internal raw-read id) -> (original read id)\n \
-it assumes the 2-asm-falcon/read_maps ... /dump_rawread_ids/rawread_ids and /dump_pread_ids/pread_ids are already generated',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--basedir', type=str, default="./", help='the base working dir of a FALCON assembly')
- args = parser.parse_args(argv[1:])
- return args
-
-def main(argv=sys.argv):
- logging.basicConfig()
- args = parse_args(argv)
- basedir = args.basedir
- #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
- #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
- asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
- hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
- quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
-
- get_read_hctg_map(asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
diff --git a/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py b/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
deleted file mode 100644
index fe7fd00..0000000
--- a/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
+++ /dev/null
@@ -1,678 +0,0 @@
-from falcon_kit.fc_asm_graph import AsmGraph
-from falcon_kit.FastaReader import FastaReader
-import os
-import networkx as nx
-from multiprocessing import Pool
-import argparse
-import re
-import sys
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-## for shared memory usage
-global p_asm_G
-global h_asm_G
-global all_rid_to_phase
-global seqs
-
-def mkdir(d):
- if not os.path.isdir(d):
- os.makedirs(d)
-
-def reverse_end( node_id ):
- node_id, end = node_id.split(":")
- new_end = "B" if end == "E" else "E"
- return node_id + ":" + new_end
-
-
-def load_sg_seq(all_read_ids, fasta_fn):
-
- seqs = {}
- # load all p-read name into memory
- f = FastaReader(fasta_fn)
- for r in f:
- if r.name not in all_read_ids:
- continue
- seqs[r.name] = r.sequence.upper()
- return seqs
-
-def generate_haplotigs_for_ctg(input_):
-
- ctg_id, out_dir = input_
- global p_asm_G
- global h_asm_G
- global all_rid_to_phase
- global seqs
- arid_to_phase = all_rid_to_phase[ctg_id]
-
- mkdir( out_dir )
-
- ctg_G = p_asm_G.get_sg_for_ctg(ctg_id)
-
- ctg_nodes = set(ctg_G.nodes())
-
- sg = nx.DiGraph()
-
- for v, w in ctg_G.edges():
-
- vrid = v[:9]
- wrid = w[:9]
-
- edge_data = p_asm_G.sg_edges[ (v, w) ]
- if edge_data[-1] != "G":
- continue
-
- vphase = arid_to_phase.get(vrid, (-1,0))
- wphase = arid_to_phase.get(wrid, (-1,0))
- if vphase[0] == wphase[0] and vphase[1] != wphase[1]:
- cross_phase = "Y"
- else:
- cross_phase = "N"
-
- sg.add_node( v, label= "%d_%d" % vphase,
- phase="%d_%d" % vphase,
- src="P" )
-
- sg.add_node( w, label= "%d_%d" % wphase,
- phase="%d_%d" % wphase,
- src="P" )
-
- sg.add_edge(v, w, src="OP", cross_phase = cross_phase)
-
- # we need to add the complimentary edges as the ctg_graph does not contain the dual edges
- rv = reverse_end(v)
- rw = reverse_end(w)
- sg.add_node( rv, label= "%d_%d" % vphase,
- phase="%d_%d" % vphase,
- src="P" )
-
- sg.add_node( rw, label= "%d_%d" % wphase,
- phase="%d_%d" % wphase,
- src="P" )
-
- sg.add_edge(rw, rv, src="OP", cross_phase = cross_phase)
-
- PG_nodes = set(sg.nodes())
- PG_edges = set(sg.edges())
-
- for v, w in h_asm_G.sg_edges:
-
- vrid = v[:9]
- wrid = w[:9]
-
- if vrid not in arid_to_phase:
- continue
- if wrid not in arid_to_phase:
- continue
-
- if (v, w) in PG_edges:
- if p_asm_G.sg_edges[(v,w)][-1] == "G":
- continue
-
- edge_data = h_asm_G.sg_edges[ (v, w) ]
-
- if edge_data[-1] != "G":
- continue
-
- cross_phase = "N"
- if v not in PG_nodes:
- sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
- phase="%d_%d" % arid_to_phase[vrid],
- src="H" )
-
- if w not in PG_nodes:
- sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
- phase="%d_%d" % arid_to_phase[wrid],
- src="H" )
-
- sg.add_edge(v, w, src="H", cross_phase = cross_phase)
-
- rv = reverse_end(v)
- rw = reverse_end(w)
- if rv not in PG_nodes:
- sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
- phase="%d_%d" % arid_to_phase[vrid],
- src="H" )
-
- if rw not in PG_nodes:
- sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
- phase="%d_%d" % arid_to_phase[wrid],
- src="H" )
-
- sg.add_edge(rw, rv, src="H", cross_phase = cross_phase)
-
- sg0 = sg.copy()
- for v, w in h_asm_G.sg_edges:
- vrid = v[:9]
- wrid = w[:9]
- if vrid not in arid_to_phase:
- continue
- if wrid not in arid_to_phase:
- continue
-
- if (v, w) in PG_edges:
- if p_asm_G.sg_edges[(v,w)][-1] == "G":
- continue
-
- edge_data = h_asm_G.sg_edges[ (v, w) ]
-
- if sg0.in_degree(w) == 0:
- cross_phase = "Y"
- if v not in PG_nodes:
- sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
- phase="%d_%d" % arid_to_phase[vrid],
- src="H" )
-
- if w not in PG_nodes:
- sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
- phase="%d_%d" % arid_to_phase[wrid],
- src="H" )
-
- sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
-
- rv = reverse_end(v)
- rw = reverse_end(w)
- if rv not in PG_nodes:
- sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
- phase="%d_%d" % arid_to_phase[vrid],
- src="H" )
-
- if rw not in PG_nodes:
- sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
- phase="%d_%d" % arid_to_phase[wrid],
- src="H" )
-
- sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
-
- if sg0.out_degree(v) == 0:
- cross_phase = "Y"
- if v not in PG_nodes:
- sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
- phase="%d_%d" % arid_to_phase[vrid],
- src="H" )
-
- if w not in PG_nodes:
- sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
- phase="%d_%d" % arid_to_phase[wrid],
- src="H" )
-
- sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
-
- rv = reverse_end(v)
- rw = reverse_end(w)
- if rv not in PG_nodes:
- sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
- phase="%d_%d" % arid_to_phase[vrid],
- src="H" )
-
- if rw not in PG_nodes:
- sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
- phase="%d_%d" % arid_to_phase[wrid],
- src="H" )
-
- sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
-
- sg2 = sg.copy()
- ctg_nodes_r = set([ reverse_end(v) for v in list(ctg_nodes) ])
- for v, w in ctg_G.edges():
- sg2.remove_edge(v, w)
- rv, rw = reverse_end(v), reverse_end(w)
- sg2.remove_edge(rw, rv)
- for v in sg2.nodes():
- if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
- sg2.remove_node(v)
-
- nodes_to_remove = set()
- edges_to_remove = set()
- for sub_g in nx.weakly_connected_component_subgraphs(sg2):
- sub_g_nodes = set(sub_g.nodes())
- if len(sub_g_nodes & ctg_nodes_r) > 0 and len(sub_g_nodes & ctg_nodes) > 0:
- # remove cross edge
- sources = [n for n in sub_g.nodes() if sub_g.in_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
- sinks = [n for n in sub_g.nodes() if sub_g.out_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
- edges_to_keep = set()
- for v in sources:
- for w in sinks:
- path = []
- if v in ctg_nodes and w not in ctg_nodes_r:
- try:
- path = nx.shortest_path( sub_g, v, w )
- except nx.exception.NetworkXNoPath:
- path = []
- elif v not in ctg_nodes and w in ctg_nodes_r:
- try:
- path = nx.shortest_path( sub_g, v, w )
- except nx.exception.NetworkXNoPath:
- path = []
-
- if len(path) >= 2:
- v1 = path[0]
- for w1 in path[1:]:
- edges_to_keep.add( (v1, w1) )
- rv1, rw1 = reverse_end(v1), reverse_end(w1)
- edges_to_keep.add( (rw1, rv1) )
- v1 = w1
- for v, w in sub_g.edges():
- if (v, w) not in edges_to_keep:
- edges_to_remove.add( (v, w) )
- rv, rw = reverse_end(v), reverse_end(w)
- edges_to_remove.add( (rw, rv) )
-
-
- if len(sub_g_nodes & ctg_nodes_r) == 0 and len(sub_g_nodes & ctg_nodes) == 0:
- nodes_to_remove.update( sub_g_nodes )
- nodes_to_remove.update( set( [reverse_end(v) for v in list(sub_g_nodes)] ) )
-
- for v, w in list(edges_to_remove):
- sg.remove_edge(v, w)
-
- for v in nodes_to_remove:
- sg.remove_node(v)
-
- for v in sg.nodes():
- if sg.out_degree(v) == 0 and sg.in_degree(v) == 0:
- sg.remove_node(v)
-
- #nx.write_gexf(sg, "full_g.gexf")
-
- s_node = p_asm_G.ctg_data[ctg_id][5][0][0]
- t_node = p_asm_G.ctg_data[ctg_id][5][-1][-1]
-
- for v, w in sg.edges():
- phase0 = sg.node[v]["phase"].split("_")
- phase1 = sg.node[w]["phase"].split("_")
- if phase0 == phase1:
- sg[v][w]["weight"] = 10
- sg[v][w]["score"] = 1
- sg[v][w]["label"] = "type0"
- else:
- if phase0[0] == phase1[0]:
- sg[v][w]["weight"] = 1
- sg[v][w]["score"] = 100000
- sg[v][w]["label"] = "type1"
- else:
- sg[v][w]["weight"] = 5
- sg[v][w]["score"] = 50
- sg[v][w]["label"] = "type2"
-
-
- sg2 = sg.copy()
- edge_to_remove = set()
- for v, w in sg2.edges():
- if sg2[v][w]["src"] == "ext":
- edge_to_remove.add( (v, w) )
- rv, rw = reverse_end(v), reverse_end(w)
- edge_to_remove.add( (rw, rv) )
-
- if sg2.node[v]["phase"] == sg2.node[w]["phase"]:
- continue
- flag1 = 0
- flag2 = 0
- for e in sg2.out_edges(v):
- if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
- flag1 = 1
- break
- if flag1 == 1:
- for e in sg2.in_edges(w):
- if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
- flag2 = 1
- break
- if flag2 == 1:
- edge_to_remove.add( (v, w) )
- rv, rw = reverse_end(v), reverse_end(w)
- edge_to_remove.add( (rw, rv) )
-
- for v, w in list(edge_to_remove):
- sg2.remove_edge(v, w)
-
- ## remove some new short cut due to repeats in the haplotype edges
- edge_to_remove = set()
- with open(os.path.join(out_dir, "path_len.%s" % ctg_id), "w") as f:
-
- all_length_from_s = nx.shortest_path_length(sg, source=s_node)
- edge_to_remove = set()
-
- for w in all_length_from_s:
- longest = 0
- if sg.in_degree(w) >= 2:
- for e in sg.in_edges(w):
- v = e[0]
- if v in all_length_from_s:
- len_ = all_length_from_s[v]
- if len_ > longest:
- longest = len_
- if longest == 0:
- continue
- for e in sg.in_edges(w):
- v = e[0]
- if v in all_length_from_s:
- len_ = all_length_from_s[v]
- print >>f, ctg_id, "link_lengths", v, w, longest, len_
- if longest - len_ > 10 and sg[v][w]["src"] != "OP": #make sure we always have one path drived from the original graph from s to t
- edge_to_remove.add( (v, w) )
- rv, rw = reverse_end(v), reverse_end(w)
- edge_to_remove.add( (rw, rv) )
-
- for v, w in list(edge_to_remove):
- sg.remove_edge(v, w)
- print >>f, ctg_id, "removed", v, w
-
- for v, w in list(edge_to_remove):
- try:
- sg2.remove_edge(v, w)
- except Exception:
- pass
-
-
- nx.write_gexf(sg, os.path.join(out_dir, "sg.gexf" ))
- nx.write_gexf(sg2, os.path.join(out_dir, "sg2.gexf" ))
-
-
- try:
- s_path = nx.shortest_path(sg2, source=s_node, target=t_node, weight="score")
- except nx.exception.NetworkXNoPath:
- s_path = nx.shortest_path(sg, source=s_node, target=t_node, weight="score")
-
- s_path_edges = []
- for i in xrange(len(s_path)-1):
- v = s_path[i]
- w = s_path[i+1]
- sg[v][w]["weight"] = 15
- s_path_edges.append( (v,w) )
-
- s_path_edge_set = set(s_path_edges)
-
-
-
- #output the updated primary contig
- p_tig_path = open(os.path.join(out_dir, "p_ctg_path.%s" % ctg_id),"w")
- p_tig_fa = open(os.path.join(out_dir, "p_ctg.%s.fa" % ctg_id),"w")
- edges_to_remove1 = set()
- edges_to_remove2 = set()
- with open(os.path.join(out_dir, "p_ctg_edges.%s" % ctg_id), "w") as f:
- seq = []
- for v, w in s_path_edges:
- sg[v][w]["h_edge"] = 1
- vrid = v.split(":")[0]
- wrid = w.split(":")[0]
- vphase = arid_to_phase.get(vrid, (-1,0))
- wphase = arid_to_phase.get(wrid, (-1,0))
- print >>f, "%s" % ctg_id, v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
-
- if sg.edge[v][w]["src"] == "OP":
- edge_data = p_asm_G.sg_edges[ (v,w) ]
- else:
- edge_data = h_asm_G.sg_edges[ (v,w) ]
-
- seq_id, s, t = edge_data[0]
- if s < t:
- seq.append(seqs[ seq_id ][ s:t ])
- else:
- seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ s:t:-1 ] ]))
- print >>p_tig_path, "%s" % ctg_id, v, w, seq_id, s, t, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
- sg[v][w]["tig_id"] = "%s" % ctg_id
-
- rv, rw = reverse_end(v), reverse_end(w)
- edges_to_remove1.add( (v, w) )
- edges_to_remove2.add( (rw, rv) )
-
- print >> p_tig_fa, ">%s" % ctg_id
- print >> p_tig_fa, "".join(seq)
-
- p_tig_fa.close()
- p_tig_path.close()
-
-
-
- sg2 = sg.copy()
- reachable1 = nx.descendants(sg2, s_node)
- sg2_r = sg2.reverse()
- reachable2 = nx.descendants(sg2_r, t_node)
-
- reachable_all = reachable1 | reachable2
- reachable_both = reachable1 & reachable2
-
-
- for v, w in list(edges_to_remove2 | edges_to_remove1):
- sg2.remove_edge( v, w )
-
- for v, w in sg2.edges():
- if sg2[v][w]["cross_phase"] == "Y":
- sg2.remove_edge( v, w )
-
- for v in sg2.nodes():
- if v not in reachable_all:
- sg2.remove_node(v)
-
- for v in sg2.nodes():
- if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
- sg2.remove_node(v)
- continue
- if v in reachable_both:
- sg2.node[v]["reachable"] = 1
- else:
- sg2.node[v]["reachable"] = 0
-
- dump_graph = False # the code segement below is useful for showing the graph
- if dump_graph == True:
- nx.write_gexf(sg2, "%s_1.gexf" % ctg_id)
-
- p_path_nodes = set(s_path)
- p_path_rc_nodes = set( [reverse_end(v) for v in s_path] )
-
- sg2_nodes = set(sg2.nodes())
- for v in p_asm_G.get_sg_for_ctg(ctg_id).nodes():
- rv = reverse_end(v)
- p_path_rc_nodes.add( rv )
- if rv in sg2_nodes:
- sg2.remove_node(rv)
-
-
- h_tig_path = open(os.path.join(out_dir, "h_ctg_path.%s" % ctg_id),"w")
- h_tig_fa = open(os.path.join(out_dir, "h_ctg_all.%s.fa" % ctg_id),"w")
- edges_to_remove = set()
-
- labelled_node = set()
- with open(os.path.join(out_dir, "h_ctg_edges.%s" % ctg_id),"w") as f:
- h_tig_id = 1
- h_paths = {}
- #print "number of components:", len([tmp for tmp in nx.weakly_connected_component_subgraphs(sg2)])
- for sub_hg_0 in nx.weakly_connected_component_subgraphs(sg2):
- sub_hg = sub_hg_0.copy()
- while sub_hg.size() > 5:
- #print "sub_hg size:", len(sub_hg.nodes())
- sources = [n for n in sub_hg.nodes() if sub_hg.in_degree(n) != 1 ]
- sinks = [n for n in sub_hg.nodes() if sub_hg.out_degree(n) != 1 ]
-
-
- #print "number of sources", len(sources), sources
- #print "number of sinks", len(sinks), sinks
- if len(sources) == 0 and len(sinks) == 0: #TODO, the rest of the sub-graph are circles, we need to break and print warnning message
- break
-
- longest = []
-
- eliminated_sinks = set()
- s_longest = {}
- for s in sources:
- #print "test source",s, len(eliminated_sinks)
- if s in labelled_node:
- continue
- s_path = []
- for t in sinks:
- if t in eliminated_sinks:
- continue
- try:
- path = nx.shortest_path(sub_hg, s, t, weight="score")
- #print "test path len:", len(path), s, t
- except nx.exception.NetworkXNoPath:
- path = []
- continue
- s_path.append( [ path, t ] )
- s_path.sort(key = lambda x: -len(x[0]))
- if len(s_path) == 0:
- continue
- s_longest[s] = s_path[0][0]
- if len(s_longest[s]) > len(longest):
- longest = s_longest[s]
- #print "s longest", longest[0], longest[-1], len(longest)
- for path, t in s_path[1:]:
- eliminated_sinks.add(t)
- #print "elimated t", t
-
-
- if len(longest) == 0:
- break
-
- s = longest[0]
- t = longest[-1]
- h_paths[ ( s, t ) ] = longest
-
- labelled_node.add(s)
- rs = reverse_end(s)
- labelled_node.add(rs)
-
- for v in longest:
- sub_hg.remove_node(v)
-
- for s, t in h_paths:
- longest = h_paths[ (s, t) ]
- #print "number of node in path", s,t,len(longest)
- seq = []
- for v, w in zip(longest[:-1], longest[1:]):
- sg[v][w]["h_edge"] = 1
- if sg.edge[v][w]["src"] == "OP":
- edge_data = p_asm_G.sg_edges[ (v,w) ]
- else:
- edge_data = h_asm_G.sg_edges[ (v,w) ]
- vrid = v.split(":")[0]
- wrid = w.split(":")[0]
- vphase = arid_to_phase.get(vrid, (-1,0))
- wphase = arid_to_phase.get(wrid, (-1,0))
- print >>f, "%s_%03d" % (ctg_id, h_tig_id), v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
-
- if sg.edge[v][w]["src"] == "OP":
- edge_data = p_asm_G.sg_edges[ (v,w) ]
- else:
- edge_data = h_asm_G.sg_edges[ (v,w) ]
-
- seq_id, sp, tp = edge_data[0]
- if sp < tp:
- seq.append(seqs[ seq_id ][ sp:tp ])
- else:
- seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ sp:tp:-1 ] ]))
- print >> h_tig_path, "%s_%03d" % (ctg_id, h_tig_id), v, w, seq_id, sp, tp, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
- sg[v][w]["tig_id"] = "%s_%03d" % (ctg_id, h_tig_id)
-
- rv, rw = reverse_end(v), reverse_end(w)
- edges_to_remove.add( (v, w) )
- edges_to_remove.add( (rw, rv) )
-
- print >> h_tig_fa, ">%s_%03d" % (ctg_id, h_tig_id)
- print >> h_tig_fa, "".join(seq)
- h_tig_id += 1
-
-
- h_tig_fa.close()
- h_tig_path.close()
-
- dump_graph = False # the code segement below is useful for showing the graph
- if dump_graph == True:
- for v, w in sg.edges():
- if "h_edge" not in sg[v][w]:
- sg[v][w]["h_edge"] = 0
- if v in reachable_all:
- sg.node[v]["reachable"] = 1
- else:
- sg.node[v]["reachable"] = 0
- if w in reachable_all:
- sg.node[w]["reachable"] = 1
- else:
- sg.node[w]["reachable"] = 0
-
- nx.write_gexf(sg, "%s_0.gexf" % ctg_id)
-
-
-
-def parse_args(argv):
-
- parser = argparse.ArgumentParser(description='layout haplotigs from primary assembly graph and phased aseembly graph')
-
- parser.add_argument('--fc_asm_path', type=str, help='path to the primary Falcon assembly output directory', required=True)
- parser.add_argument('--fc_hasm_path', type=str, help='path to the phased Falcon assembly output directory', required=True)
- parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', default = "all", required=True)
- parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, default to current working directory')
- parser.add_argument('--rid_phase_map', type=str, help="path to the file that encode the relationship of the read id to phase blocks", required=True)
- parser.add_argument('--fasta', type=str, help="sequence file of the p-reads", required=True)
- args = parser.parse_args(argv[1:])
-
- return args
-
-def main(argv=sys.argv):
-
- # make life easier for now. will refactor it out if possible
- global all_rid_to_phase
- global p_asm_G
- global h_asm_G
- global all_rid_to_phase
- global seqs
-
- args = parse_args(argv)
- fc_asm_path = args.fc_asm_path
- fc_hasm_path = args.fc_hasm_path
- ctg_id = args.ctg_id
- base_dir = args.base_dir
- fasta_fn = args.fasta
-
- p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"),
- os.path.join(fc_asm_path, "utg_data"),
- os.path.join(fc_asm_path, "ctg_paths") )
-
- h_asm_G = AsmGraph( os.path.join(fc_hasm_path, "sg_edges_list"),
- os.path.join(fc_hasm_path, "utg_data"),
- os.path.join(fc_hasm_path, "ctg_paths") )
-
-
- all_rid_to_phase = {}
-
- all_read_ids = set()
- with open(args.rid_phase_map) as f:
- for row in f:
- row = row.strip().split()
- all_rid_to_phase.setdefault( row[1], {} )
- all_rid_to_phase[row[1]][row[0]] = (int(row[2]), int(row[3]))
- all_read_ids.add(row[0])
-
- for v, w in p_asm_G.sg_edges:
- if p_asm_G.sg_edges[ (v, w) ][-1] != "G":
- continue
- v = v.split(":")[0]
- w = w.split(":")[0]
- all_read_ids.add(v)
- all_read_ids.add(w)
-
- for v, w in h_asm_G.sg_edges:
- if h_asm_G.sg_edges[ (v, w) ][-1] != "G":
- continue
- v = v.split(":")[0]
- w = w.split(":")[0]
- all_read_ids.add(v)
- all_read_ids.add(w)
-
- seqs = load_sg_seq(all_read_ids, fasta_fn)
-
- if ctg_id == "all":
- ctg_id_list = p_asm_G.ctg_data.keys()
- else:
- ctg_id_list = [ctg_id]
-
- exe_list = []
- for ctg_id in ctg_id_list:
- if ctg_id[-1] != "F":
- continue
- if ctg_id not in all_rid_to_phase:
- continue
- exe_list.append( (ctg_id, os.path.join(".", ctg_id)) )
-
- exec_pool = Pool(4) #TODO, make this configurable
- exec_pool.map( generate_haplotigs_for_ctg, exe_list)
- #map( generate_haplotigs_for_ctg, exe_list)
diff --git a/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py b/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
deleted file mode 100644
index ab92956..0000000
--- a/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
+++ /dev/null
@@ -1,354 +0,0 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from multiprocessing import Pool
-import subprocess as sp
-import shlex
-import argparse
-import re
-import sys
-
-arid2phase = {}
-
-def filter_stage1(input_):
- db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len = input_
- try:
- ignore_rtn = []
- current_q_id = None
- ave_idt = 0.0
- all_over_len = 0.0
- overlap_data = {"5p":0, "3p":0}
- overlap_phase = {"5p":set(), "3p":set()}
- q_id = None
-
- for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) ) ).splitlines():
- l = l.strip().split()
- q_id, t_id = l[:2]
-
- if q_id not in arid2phase:
- continue
- if t_id not in arid2phase:
- continue
- if arid2phase[t_id][0] != arid2phase[q_id][0]:
- continue
-
- if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
- continue
-
- if q_id != None and q_id != current_q_id:
-
- left_count = overlap_data["5p"]
- right_count = overlap_data["3p"]
-
- if abs(left_count - right_count) > max_diff:
- ignore_rtn.append( current_q_id )
- elif left_count > max_ovlp or right_count > max_ovlp:
- ignore_rtn.append( current_q_id )
- elif left_count < min_ovlp or right_count < min_ovlp:
- ignore_rtn.append( current_q_id )
-
- #remove unphased reads if they are sandwiched by the reads from the same phase
- if current_q_id not in arid2phase:
- left_set = overlap_phase["5p"]
- right_set = overlap_phase["3p"]
- if len( left_set & right_set ) > 0:
- ignore_rtn.append( current_q_id )
-
- overlap_data = {"5p":0, "3p":0}
- overlap_phase = {"5p":set(), "3p":set()}
- current_q_id = q_id
- ave_idt = 0.0
- all_over_len = 0.0
-
- overlap_len = -int(l[2])
- idt = float(l[3])
- q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
- t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
-
- if idt < 90:
- continue
-
- if q_l < min_len or t_l < min_len:
- continue
-
- if l[-1] in ("contains", "overlap"):
- ave_idt += idt * overlap_len
- all_over_len += overlap_len
- if q_s == 0:
- overlap_data["5p"] += 1
- if t_id in arid2phase:
- overlap_phase["5p"].add( arid2phase[t_id] )
- if q_e == q_l:
- overlap_data["3p"] += 1
- if t_id in arid2phase:
- overlap_phase["3p"].add( arid2phase[t_id] )
-
- if q_id != None:
- left_count = overlap_data["5p"]
- right_count = overlap_data["3p"]
- if abs(left_count - right_count) > max_diff:
- ignore_rtn.append( current_q_id )
- elif left_count > max_ovlp or right_count > max_ovlp:
- ignore_rtn.append( current_q_id )
- elif left_count < min_ovlp or right_count < min_ovlp:
- ignore_rtn.append( current_q_id )
-
- #remove unphased reads if they are sandwiched by the reads from the same phase
- if current_q_id not in arid2phase:
- left_set = overlap_phase["5p"]
- right_set = overlap_phase["3p"]
- if len( left_set & right_set ) > 0:
- ignore_rtn.append( current_q_id )
-
-
- return fn, ignore_rtn
-
- except (KeyboardInterrupt, SystemExit):
- return
-
-def filter_stage2(input_):
- db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set = input_
- try:
- contained_id = set()
- for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn))).splitlines():
- l = l.strip().split()
- q_id, t_id = l[:2]
-
-
- if q_id not in arid2phase:
- continue
- if t_id not in arid2phase:
- continue
-
- if arid2phase[t_id][0] != arid2phase[q_id][0]:
- continue
- if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
- continue
-
-
- q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
- t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
-
- idt = float(l[3])
- if idt < 90:
- continue
-
- if q_l < min_len or t_l < min_len:
- continue
-
- if q_id in ignore_set:
- continue
- if t_id in ignore_set:
- continue
- if l[-1] == "contained":
- contained_id.add(q_id)
- if l[-1] == "contains":
- contained_id.add(t_id)
- return fn, contained_id
-
- except (KeyboardInterrupt, SystemExit):
- return
-
-def filter_stage3(input_):
-
- db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn = input_
-
- overlap_data = {"5p":[], "3p":[]}
- try:
- ovlp_output = []
- current_q_id = None
- for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) )).splitlines():
- l = l.strip().split()
- q_id, t_id = l[:2]
-
-
- if q_id not in arid2phase:
- continue
- if t_id not in arid2phase:
- continue
-
- if arid2phase[t_id][0] != arid2phase[q_id][0]:
- continue
- if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
- continue
-
- if current_q_id == None:
- current_q_id = q_id
- overlap_data = {"5p":[], "3p":[]}
-
- elif q_id != current_q_id:
- left = overlap_data["5p"]
- right = overlap_data["3p"]
- left.sort()
- right.sort()
-
- for i in xrange(len(left)):
- inphase, score, m_range, ovlp = left[i]
- ovlp_output.append(ovlp)
- #print " ".join(ovlp), read_end_data[current_q_id]
- if i >= bestn and m_range > 1000:
- break
-
- for i in xrange(len(right)):
- inphase, score, m_range, ovlp = right[i]
- ovlp_output.append(ovlp)
- #print " ".join(ovlp), read_end_data[current_q_id]
- if i >= bestn and m_range > 1000:
- break
-
- overlap_data = {"5p":[], "3p":[]}
- current_q_id = q_id
-
- if q_id in contained_set:
- continue
- if t_id in contained_set:
- continue
- if q_id in ignore_set:
- continue
- if t_id in ignore_set:
- continue
-
- overlap_len = -int(l[2])
- idt = float(l[3])
- q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
- t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
-
- if idt < 90:
- continue
- if q_l < min_len or t_l < min_len:
- continue
- if q_s == 0:
- l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
- inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
- #nphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
- overlap_data["5p"].append( (-inphase, -overlap_len, t_l - (t_e - t_s), l ) )
- elif q_e == q_l:
- l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
- inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
- #inphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
- overlap_data["3p"].append( (-inphase, -overlap_len, t_l - (t_e - t_s), l ) )
-
- left = overlap_data["5p"]
- right = overlap_data["3p"]
- left.sort()
- right.sort()
-
-
- for i in xrange(len(left)):
- inphase, score, m_range, ovlp = left[i]
- ovlp_output.append(ovlp)
- #print " ".join(ovlp), read_end_data[current_q_id]
- if i >= bestn and m_range > 1000:
- break
-
- for i in xrange(len(right)):
- inphase, score, m_range, ovlp = right[i]
- ovlp_output.append(ovlp)
- #print " ".join(ovlp), read_end_data[current_q_id]
- if i >= bestn and m_range > 1000:
- break
-
- return fn, ovlp_output
- except (KeyboardInterrupt, SystemExit):
- return
-
-
-def parse_args(argv):
-
- parser = argparse.ArgumentParser(description='a simple multi-processes LAS ovelap data filter')
- parser.add_argument('--n_core', type=int, default=4,
- help='number of processes used for generating consensus')
- parser.add_argument('--fofn', type=str, help='file contains the path of all LAS file to be processed in parallel')
- parser.add_argument('--db', type=str, help='read db file path')
- parser.add_argument('--max_diff', type=int, help="max difference of 5' and 3' coverage")
- parser.add_argument('--max_cov', type=int, help="max coverage of 5' or 3' coverage")
- parser.add_argument('--min_cov', type=int, help="min coverage of 5' or 3' coverage")
- parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads")
- parser.add_argument('--bestn', type=int, default=10, help="output at least best n overlaps on 5' or 3' ends if possible")
- parser.add_argument('--rid_phase_map', type=str, help="the file that encode the relationship of the read id to phase blocks", required=True)
- args = parser.parse_args(argv[1:])
-
- return args
-
-def main(argv=sys.argv):
- args = parse_args(argv)
-
- max_diff = args.max_diff
- max_cov = args.max_cov
- min_cov = args.min_cov
- min_len = args.min_len
- bestn = args.bestn
- db_fn = args.db
-
- with open(args.rid_phase_map) as f:
- for row in f:
- row = row.strip().split()
- arid2phase[row[0]] = (row[1], row[2], row[3]) #ctg_id, phase_blk_id, phase_id
-
- exe_pool = Pool(args.n_core)
-
- file_list = open(args.fofn).read().split("\n")
- inputs = []
- for fn in file_list:
- if len(fn) != 0:
- inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len) )
-
- ignore_all = []
- for res in exe_pool.imap(filter_stage1, inputs):
- ignore_all.extend( res[1] )
-
- inputs = []
- ignore_all = set(ignore_all)
- for fn in file_list:
- if len(fn) != 0:
- inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all) )
- contained = set()
- for res in exe_pool.imap(filter_stage2, inputs):
- contained.update(res[1])
- #print res[0], len(res[1]), len(contained)
-
- #print "all", len(contained)
- inputs = []
- ignore_all = set(ignore_all)
- for fn in file_list:
- if len(fn) != 0:
- inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn) )
- for res in exe_pool.imap(filter_stage3, inputs):
- for l in res[1]:
- print " ".join(l)
diff --git a/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py b/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
deleted file mode 100644
index 772f011..0000000
--- a/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
+++ /dev/null
@@ -1,1545 +0,0 @@
-import networkx as nx
-import os
-import shlex
-import sys
-import subprocess
-import argparse
-
-DEBUG_LOG_LEVEL = 0
-
-class SGNode(object):
- """
- class representing a node in the string graph
- """
- def __init__(self, node_name):
- self.name = node_name
- self.out_edges = []
- self.in_edges = []
-
- def add_out_edge(self, out_edge):
- self.out_edges.append(out_edge)
-
- def del_out_edge(self, out_edge):
- self.out_edges.remove(out_edge)
-
- def add_in_edge(self, in_edge):
- self.in_edges.append(in_edge)
-
- def del_in_edge(self, in_edge):
- self.in_edges.remove(in_edge)
-
-class SGEdge(object):
- """
- class representing an edge in the string graph
- """
- def __init__(self, in_node, out_node):
- self.in_node = in_node
- self.out_node = out_node
- self.attr = {}
- def set_attribute(self, attr, value):
- self.attr[attr] = value
-
-def reverse_end( node_id ):
- node_id, end = node_id.split(":")
- new_end = "B" if end == "E" else "E"
- return node_id + ":" + new_end
-
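Node names throughout this module are of the form read_id:B or read_id:E, for the two ends of a read; reverse_end flips that suffix to get a node's reverse-complement twin. For example, assuming the helper just defined:

    print(reverse_end("000000001:E"))   # -> "000000001:B"
    print(reverse_end("000000001:B"))   # -> "000000001:E"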
-class StringGraph(object):
- """
- class representing the string graph
- """
- def __init__(self):
- self.nodes = {}
- self.edges = {}
- self.n_mark = {}
- self.e_reduce = {}
- self.repeat_overlap = {}
-
- def add_node(self, node_name):
- """
- add a node to the graph, given a node name
- """
- if node_name not in self.nodes:
- self.nodes[node_name] = SGNode(node_name)
-
- def add_edge(self, in_node_name, out_node_name, **attributes):
- """
- add an edge to the graph, given a pair of node names
- """
- if (in_node_name, out_node_name) not in self.edges:
-
- self.add_node(in_node_name)
- self.add_node(out_node_name)
- in_node = self.nodes[in_node_name]
- out_node = self.nodes[out_node_name]
-
- edge = SGEdge(in_node, out_node)
- self.edges[ (in_node_name, out_node_name) ] = edge
- in_node.add_out_edge(edge)
- out_node.add_in_edge(edge)
- edge = self.edges[ (in_node_name, out_node_name) ]
- for k, v in attributes.items():
- edge.attr[k] = v
-
- def remove_cross_phase_edges(self):
-
- to_remove = {}
-
- for k,e in self.edges.items():
- if e.attr["phase"][0] != e.attr["phase"][1]:
- to_remove[k] = e
-
- for k, e in to_remove.items():
- in_node_name, out_node_name = k
- in_node = self.nodes[in_node_name]
- out_node = self.nodes[out_node_name]
- in_node.del_out_edge(e)
- out_node.del_in_edge(e)
- del self.edges[k]
-
- return to_remove
-
- def init_reduce_dict(self):
- for e in self.edges:
- self.e_reduce[e] = False
-
- def bfs_nodes(self, n, exclude = None, depth=5):
- all_nodes = set()
- all_nodes.add(n)
- candidate_nodes = set()
- candidate_nodes.add(n)
- dp = 1
- while dp < depth and len(candidate_nodes) > 0:
- v = candidate_nodes.pop()
- for e in v.out_edges :
- w = e.out_node
- if w == exclude:
- continue
- if w not in all_nodes:
- all_nodes.add(w)
- if len(w.out_edges) > 0:
- candidate_nodes.add(w)
- dp += 1
-
- return all_nodes
-
-
- def mark_chimer_edges(self):
-
- multi_in_nodes = {}
- multi_out_nodes = {}
- for n_name in self.nodes:
- n = self.nodes[n_name]
- out_nodes = [ e.out_node for e in n.out_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False ]
- in_nodes = [e.in_node for e in n.in_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False]
-
- if len(out_nodes) >= 2:
- multi_out_nodes[n_name] = out_nodes
- if len(in_nodes) >= 2:
- multi_in_nodes[n_name] = in_nodes
-
- chimer_candidates = set()
- out_set = set()
- in_set = set()
- for n_name in multi_out_nodes:
- out_nodes = set(multi_out_nodes[n_name])
- out_set |= out_nodes
-
- for n_name in multi_in_nodes:
- in_nodes = set(multi_in_nodes[n_name])
- in_set |= in_nodes
-
- chimer_candidates = out_set & in_set
-
- chimer_nodes = []
- chimer_edges = set()
- for n in chimer_candidates:
- out_nodes = set( [ e.out_node for e in n.out_edges ] )
- test_set = set()
- for in_node in [e.in_node for e in n.in_edges ]:
- test_set = test_set | set( [ e.out_node for e in in_node.out_edges ] )
- test_set -= set([n])
- if len( out_nodes & test_set ) == 0:
- flow_node1 = set()
- flow_node2 = set()
- for v in list(out_nodes):
- flow_node1 |= self.bfs_nodes(v, exclude=n)
- for v in list(test_set):
- flow_node2 |= self.bfs_nodes(v, exclude=n)
- if len( flow_node1 & flow_node2 ) == 0:
- for e in n.out_edges:
- v, w = e.in_node.name, e.out_node.name
- if self.e_reduce[ (v, w) ] != True:
- self.e_reduce[ (v, w) ] = True
- chimer_edges.add( (v, w) )
- rv = reverse_end(w)
- rw = reverse_end(v)
- self.e_reduce[ (rv, rw) ] = True
- chimer_edges.add( (rv, rw) )
-
- for e in n.in_edges:
- v, w = e.in_node.name, e.out_node.name
- if self.e_reduce[ (v, w) ] != True:
- self.e_reduce[ (v, w) ] = True
- chimer_edges.add( (v, w) )
- rv = reverse_end(w)
- rw = reverse_end(v)
- self.e_reduce[ (rv, rw) ] = True
- chimer_edges.add( (rv, rw) )
- chimer_nodes.append( n.name )
- chimer_nodes.append( reverse_end(n.name) )
-
- return chimer_nodes, chimer_edges
-
- def mark_spur_edge(self):
-
- removed_edges = set()
- for v in self.nodes:
- if len(self.nodes[v].out_edges) > 1:
- for out_edge in self.nodes[v].out_edges:
- w = out_edge.out_node.name
-
- if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
- self.e_reduce[(v, w)] = True
- removed_edges.add( (v, w) )
- v2, w2 = reverse_end(w), reverse_end(v)
- self.e_reduce[(v2, w2)] = True
- removed_edges.add( (v2, w2) )
-
- if len(self.nodes[v].in_edges) > 1:
- for in_edge in self.nodes[v].in_edges:
- w = in_edge.in_node.name
- if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
- self.e_reduce[(w, v)] = True
- removed_edges.add( (w, v) )
- v2, w2 = reverse_end(w), reverse_end(v)
- self.e_reduce[(w2, v2)] = True
- removed_edges.add( (w2, v2) )
- return removed_edges
-
- def mark_tr_edges(self):
- """
- transitive reduction
- """
- n_mark = self.n_mark
- e_reduce = self.e_reduce
- FUZZ = 500
- for n in self.nodes:
- n_mark[n] = "vacant"
-
- for n_name, node in self.nodes.items():
-
- out_edges = node.out_edges
- if len(out_edges) == 0:
- continue
-
- out_edges.sort(key=lambda x: x.attr["length"])
-
- for e in out_edges:
- w = e.out_node
- n_mark[ w.name ] = "inplay"
-
- max_len = out_edges[-1].attr["length"]
-
- max_len += FUZZ
-
- for e in out_edges:
- e_len = e.attr["length"]
- w = e.out_node
- if n_mark[w.name] == "inplay":
- w.out_edges.sort( key=lambda x: x.attr["length"] )
- for e2 in w.out_edges:
- if e2.attr["length"] + e_len < max_len:
- x = e2.out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
-
- for e in out_edges:
- e_len = e.attr["length"]
- w = e.out_node
- w.out_edges.sort( key=lambda x: x.attr["length"] )
- if len(w.out_edges) > 0:
- x = w.out_edges[0].out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
- for e2 in w.out_edges:
- if e2.attr["length"] < FUZZ:
- x = e2.out_node
- if n_mark[x.name] == "inplay":
- n_mark[x.name] = "eliminated"
-
- for out_edge in out_edges:
- v = out_edge.in_node
- w = out_edge.out_node
- if n_mark[w.name] == "eliminated":
- e_reduce[ (v.name, w.name) ] = True
- v_name, w_name = reverse_end(w.name), reverse_end(v.name)
- e_reduce[(v_name, w_name)] = True
- n_mark[w.name] = "vacant"
-
-
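mark_tr_edges performs the classic string-graph transitive reduction: an out-edge v->x is marked redundant when a two-step path v->w->x explains (almost) the same span, within the FUZZ tolerance. A rough, self-contained sketch of that rule on a plain dict-of-lists graph (the node names, lengths, and the sum-of-lengths criterion here are illustrative, not the exact inplay/eliminated marking loop above):

    FUZZ = 500
    # hypothetical graph: node -> [(neighbor, edge length)]
    edges = {
        "a:E": [("b:E", 1000), ("c:E", 1900)],
        "b:E": [("c:E", 1000)],
        "c:E": [],
    }

    def transitive_reduce(edges, fuzz=FUZZ):
        length = {(v, w): l for v, nbrs in edges.items() for w, l in nbrs}
        reduced = set()
        for v, nbrs in edges.items():
            for w, l_vw in nbrs:
                for x, l_wx in edges[w]:
                    if (v, x) in length and l_vw + l_wx <= length[(v, x)] + fuzz:
                        reduced.add((v, x))   # v->x is explained by v->w->x
        return reduced

    print(transitive_reduce(edges))   # -> the edge ('a:E', 'c:E') is reduced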
- def mark_best_overlap(self):
- """
- find the best-overlap edges and reduce the rest
- """
-
- best_edges = set()
- removed_edges = set()
-
- for v in self.nodes:
-
- out_edges = self.nodes[v].out_edges
- if len(out_edges) > 0:
- out_edges.sort(key=lambda e: -e.attr["score"])
- for e in out_edges:
- if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
- best_edges.add( (e.in_node.name, e.out_node.name) )
- break
-
- in_edges = self.nodes[v].in_edges
- if len(in_edges) > 0:
- in_edges.sort(key=lambda e: -e.attr["score"])
- for e in in_edges:
- if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
- best_edges.add( (e.in_node.name, e.out_node.name) )
- break
-
- if DEBUG_LOG_LEVEL > 1:
- print "X", len(best_edges)
-
- for e_n, e in self.edges.items():
- v = e_n[0]
- w = e_n[1]
- if self.e_reduce[ (v, w) ] != True:
- if (v, w) not in best_edges:
- self.e_reduce[(v, w)] = True
- removed_edges.add( (v, w) )
- v2, w2 = reverse_end(w), reverse_end(v)
- self.e_reduce[(v2, w2)] = True
- removed_edges.add( (v2, w2) )
-
- return removed_edges
-
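mark_best_overlap keeps, for every node, only its highest-scoring surviving out-edge and in-edge, and reduces every other edge together with its reverse-complement twin. A small, hedged sketch of just the selection rule, using made-up (v, w, score) tuples rather than the SGEdge objects above:

    # hypothetical overlaps: (v, w, score)
    edges = [("r1:E", "r2:B", 90), ("r1:E", "r3:B", 40), ("r4:E", "r3:B", 80)]

    best = set()
    for node in set(v for v, _, _ in edges):          # best out-edge per node
        out = [e for e in edges if e[0] == node]
        best.add(max(out, key=lambda e: e[2])[:2])
    for node in set(w for _, w, _ in edges):          # best in-edge per node
        inc = [e for e in edges if e[1] == node]
        best.add(max(inc, key=lambda e: e[2])[:2])

    removed = [(v, w) for v, w, _ in edges if (v, w) not in best]
    print(removed)   # [('r1:E', 'r3:B')]: neither a best out-edge nor a best in-edge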
- def resolve_repeat_edges(self):
-
-
- edges_to_reduce = []
- nodes_to_test = set()
- for v_n, v in self.nodes.items():
-
- out_nodes = []
- for e in v.out_edges:
- if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
- out_nodes.append( e.out_node.name )
-
- in_nodes = []
- for e in v.in_edges:
- if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
- in_nodes.append( e.in_node.name )
-
- if len(out_nodes) == 1 and len(in_nodes) == 1:
- nodes_to_test.add(v_n)
-
- for v_n in list( nodes_to_test ):
-
- v = self.nodes[v_n]
-
- out_nodes = []
- for e in v.out_edges:
- if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
- out_nodes.append( e.out_node.name )
-
- in_nodes = []
- for e in v.in_edges:
- if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
- in_nodes.append( e.in_node.name )
-
- in_node_name = in_nodes[0]
-
- for out_edge in self.nodes[in_node_name].out_edges:
- vv = out_edge.in_node.name
- ww = out_edge.out_node.name
-
- ww_out = self.nodes[ww].out_edges
- v_out = self.nodes[v_n].out_edges
- ww_out_nodes = set( [ n.out_node.name for n in ww_out] )
- v_out_nodes = set( [ n.out_node.name for n in v_out] )
- o_overlap = len( ww_out_nodes & v_out_nodes )
-
- ww_in_count = 0
- for e in self.nodes[ww].in_edges:
- if self.e_reduce[ ( e.in_node.name, e.out_node.name ) ] == False:
- ww_in_count += 1
-
- if ww != v_n and\
- self.e_reduce[ (vv, ww) ] == False and\
- ww_in_count > 1 and\
- ww not in nodes_to_test and\
- o_overlap == 0:
- edges_to_reduce.append( (vv, ww) )
-
- out_node_name = out_nodes[0]
-
- for in_edge in self.nodes[out_node_name].in_edges:
- vv = in_edge.in_node.name
- ww = in_edge.out_node.name
-
- vv_in = self.nodes[vv].in_edges
- v_in = self.nodes[v_n].in_edges
- vv_in_nodes = set( [ n.in_node.name for n in vv_in] )
- v_in_nodes = set( [ n.in_node.name for n in v_in] )
- i_overlap = len( vv_in_nodes & v_in_nodes )
-
- vv_out_count = 0
- for e in self.nodes[vv].out_edges:
- if self.e_reduce[ ( e.in_node.name, e.out_node.name )] == False:
- vv_out_count += 1
-
- if vv != v_n and\
- self.e_reduce[ (vv, ww) ] == False and\
- vv_out_count > 1 and\
- vv not in nodes_to_test and\
- i_overlap == 0:
- edges_to_reduce.append( (vv, ww) )
-
- removed_edges = set()
- for e in edges_to_reduce:
- self.e_reduce[e] = True
- removed_edges.add(e)
-
- return removed_edges
-
- def get_out_edges_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].out_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- return rtn
-
-
- def get_in_edges_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].in_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- return rtn
-
- def get_best_out_edge_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].out_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- rtn.sort(key=lambda e: e.attr["score"])
-
- return rtn[-1]
-
- def get_best_in_edge_for_node(self, name, mask=True):
- rtn = []
- for e in self.nodes[name].in_edges:
- v = e.in_node
- w = e.out_node
- if self.e_reduce[ (v.name, w.name) ] == False:
- rtn.append(e)
- rtn.sort(key=lambda e: e.attr["score"])
- return rtn[-1]
-
-
-RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-def generate_seq_from_path(sg, seqs, path):
- subseqs = []
- r_id, end = path[0].split(":")
-
- count = 0
- for i in range( len( path ) -1 ):
- w_n, v_n = path[i:i+2]
- edge = sg.edges[ (w_n, v_n ) ]
- read_id, coor = edge.attr["label"].split(":")
- b,e = coor.split("-")
- b = int(b)
- e = int(e)
- if b < e:
- subseqs.append( seqs[read_id][b:e] )
- else:
- subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
-
- return "".join(subseqs)
-
-
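generate_seq_from_path stitches a sequence together from per-edge labels: a label of the form "read:b-e" selects seqs[read][b:e] when b < e, and a reverse-complement slice (via RCMAP) when b > e. A tiny usage sketch with fabricated reads, assuming the StringGraph class and the function above are in scope (note the function parses string labels, so the labels below are given in that "read:b-e" form):

    seqs = {"r1": "ACGTACGTAA", "r2": "TTGGCCAATT"}   # hypothetical read sequences
    sg = StringGraph()
    sg.add_edge("r1:B", "r2:B", label="r1:4-0")    # b > e: reverse-complement slice of r1
    sg.add_edge("r2:B", "r2:E", label="r2:0-10")   # b < e: r2[0:10], forward strand
    print(generate_seq_from_path(sg, seqs, ["r1:B", "r2:B", "r2:E"]))   # -> "TACGTTGGCCAATT"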
-def reverse_edge( e ):
- e1, e2 = e
- return reverse_end(e2), reverse_end(e1)
-
-def reverse_path( p ):
- p = p[::-1]
- return [reverse_end(n) for n in p]
-
-
-def find_bundle(ug, u_edge_data, start_node, depth_cutoff, width_cutoff, length_cutoff):
-
- tips = set()
- bundle_edges = set()
- bundle_nodes = set()
-
- local_graph = nx.ego_graph(ug, start_node, depth_cutoff, undirected=False)
- length_to_node = {start_node:0}
- score_to_node = {start_node:0}
-
- v = start_node
- end_node = start_node
-
- if DEBUG_LOG_LEVEL > 1:
- print
- print
- print "start", start_node
-
- bundle_nodes.add(v)
- for vv, ww, kk in local_graph.out_edges(v, keys = True):
- max_score = 0
- max_length = 0
-
- if (vv, ww, kk) not in bundle_edges and\
- reverse_end(ww) not in bundle_nodes:
-
- bundle_edges.add( (vv, ww, kk) )
- tips.add(ww)
-
- for v in list(tips):
- bundle_nodes.add(v)
-
- depth = 1
- width = 1.0
- converage = False
-
-
- while 1:
- if DEBUG_LOG_LEVEL > 1:
- print "# of tips", len(tips)
-
- if len(tips) > 4:
- converage = False
- break
-
- if len(tips) == 1:
- end_node = tips.pop()
-
- if DEBUG_LOG_LEVEL > 1:
- print "end", end_node
-
- if end_node not in length_to_node:
- v = end_node
- max_score_edge = None
- max_score = 0
- for uu, vv, kk in local_graph.in_edges(v, keys=True):
- if uu not in length_to_node:
- continue
-
- score = u_edge_data[ (uu, vv, kk) ][1]
-
- if score > max_score:
-
- max_score = score
- max_score_edge = (uu, vv, kk)
-
- length_to_node[v] = length_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][0]
- score_to_node[v] = score_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][1]
-
-
- converage = True
- break
-
-
- depth += 1
- width = 1.0 * len(bundle_edges) / depth
-
- if depth > 10 and width > width_cutoff:
- converage = False
- break
-
- if depth > depth_cutoff:
- converage = False
- break
-
- tips_list = list(tips)
-
- tip_updated = False
- loop_detect = False
- length_limit_reached = False
-
- for v in tips_list:
- if DEBUG_LOG_LEVEL > 1:
- print "process", v
-
- if len(local_graph.out_edges(v, keys=True)) == 0: # dead end route
- print "no out edge", v
- continue
-
- max_score_edge = None
- max_score = 0
-
- extend_tip = True
-
- for uu, vv, kk in local_graph.in_edges(v, keys=True):
- if DEBUG_LOG_LEVEL > 1:
- print "in_edges", uu, vv, kk
- print uu, "in length_to_node", uu in length_to_node
-
- if uu not in length_to_node:
- extend_tip = False
- break
-
- score = u_edge_data[ (uu, vv, kk) ][1]
-
- if score > max_score:
-
- max_score = score
- max_score_edge = (uu, vv, kk)
-
- if extend_tip:
-
- length_to_node[v] = length_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][0]
- score_to_node[v] = score_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][1]
-
- if length_to_node[v] > length_cutoff:
- length_limit_reached = True
- converage = False
- break
-
- v_updated = False
- for vv, ww, kk in local_graph.out_edges(v, keys=True):
-
- if DEBUG_LOG_LEVEL > 1:
- print "test", vv, ww, kk
-
- if ww in length_to_node:
- loop_detect = True
- if DEBUG_LOG_LEVEL > 1:
- print "loop_detect", ww
- break
-
- if (vv, ww, kk) not in bundle_edges and\
- reverse_end(ww) not in bundle_nodes:
-
- if DEBUG_LOG_LEVEL > 1:
- print "add", ww
-
- tips.add(ww)
- bundle_edges.add( (vv, ww, kk) )
- tip_updated = True
- v_updated = True
-
- if v_updated:
-
- if DEBUG_LOG_LEVEL > 1:
- print "remove", v
-
- tips.remove(v)
-
- if len(tips) == 1:
- break
-
- if loop_detect:
- converage = False
- break
-
- if length_limit_reached:
- converage = False
- break
-
- if loop_detect:
- converage = False
- break
-
- if not tip_updated:
- converage = False
- break
-
- for v in list(tips):
- bundle_nodes.add(v)
-
-
-
- data = start_node, end_node, bundle_edges, length_to_node[end_node], score_to_node[end_node], depth
-
- data_r = None
-
- if DEBUG_LOG_LEVEL > 1:
- print converage, data, data_r
- return converage, data, data_r
-
-def generate_string_graph(args):
-
- overlap_file = args.overlap_file
-
- contained_reads = set()
- chimer_ids = set()
-
- filter_reads = False
-
- seqs = set()
-
- G=nx.Graph()
- edges =set()
- overlap_data = []
- contained_reads = set()
- overlap_count = {}
-
-
- # loop through the overlap data and load it into a python list
- # contained reads are identified
-
- with open(overlap_file) as f:
- for l in f:
- l = l.strip().split()
-
-
- #work around some ill-formed data records
- #if len(l) != 13:
- # continue
-
- f_id, g_id, score, identity = l[:4]
-
- if f_id == g_id: # skip self-overlaps
- continue
-
- if filter_reads:
-
- if g_id not in seqs:
- continue
-
- if f_id not in seqs:
- continue
-
- phase1 = l[13]
- phase2 = l[14]
-
- score = int(score)
- identity = float(identity)
- contained = l[12]
- if contained == "contained" and phase1 == phase2:
- contained_reads.add(f_id)
- continue
- if contained == "contains" and phase1 == phase2:
- contained_reads.add(g_id)
- continue
- if contained == "none":
- continue
-
- if identity < args.min_idt: # only keep records with identity >= min_idt (default 96%)
- continue
- f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
- g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
-
- # only use reads longer than min_len (default 4 kb) for assembly
- if f_len < args.min_len: continue
- if g_len < args.min_len: continue
-
- """
- # double check for proper overlap
- # this is not necessary when using DALIGNER for overlapper
- # it may be useful if other overlappers give fuzzier alignment boundary
- if f_start > 24 and f_len - f_end > 24: # allow 24 base tolerance on both sides of the overlapping
- continue
-
- if g_start > 24 and g_len - g_end > 24:
- continue
-
- if g_strain == 0:
- if f_start < 24 and g_len - g_end > 24:
- continue
- if g_start < 24 and f_len - f_end > 24:
- continue
- else:
- if f_start < 24 and g_start > 24:
- continue
- if g_start < 24 and f_start > 24:
- continue
- """
-
- overlap_data.append( (f_id, g_id, score, identity,
- f_strain, f_start, f_end, f_len,
- g_strain, g_start, g_end, g_len,
- phase1, phase2) )
-
- overlap_count[f_id] = overlap_count.get(f_id,0)+1
- overlap_count[g_id] = overlap_count.get(g_id,0)+1
-
- overlap_set = set()
- sg = StringGraph()
- for od in overlap_data:
- f_id, g_id, score, identity = od[:4]
- if f_id in contained_reads:
- continue
- if g_id in contained_reads:
- continue
- f_s, f_b, f_e, f_l = od[4:8]
- g_s, g_b, g_e, g_l = od[8:12]
-
- phase1 = od[12]
- phase2 = od[13]
-
- overlap_pair = [f_id, g_id]
- overlap_pair.sort()
- overlap_pair = tuple( overlap_pair )
- if overlap_pair in overlap_set: # don't allow duplicated records
- continue
- else:
- overlap_set.add(overlap_pair)
-
-
- if g_s == 1: # reversed alignment, swap the begin and end coordinates
- g_b, g_e = g_e, g_b
-
- # build the string graph edges for each overlap
- if f_b > 1:
- if g_b < g_e:
- """
- f.B f.E
- f ----------->
- g ------------->
- g.B g.E
- """
- if f_b == 0 or g_e - g_l == 0:
- continue
- sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
- length = abs(f_b-0),
- score = -score,
- identity = identity,
- phase = (phase2, phase1))
- sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = (g_id, g_e, g_l),
- length = abs(g_e-g_l),
- score = -score,
- identity = identity,
- phase = (phase1, phase2))
- else:
- """
- f.B f.E
- f ----------->
- g <-------------
- g.E g.B
- """
- if f_b == 0 or g_e == 0:
- continue
- sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
- length = abs(f_b -0),
- score = -score,
- identity = identity,
- phase = (phase2, phase1))
- sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = (g_id, g_e, 0),
- length = abs(g_e- 0),
- score = -score,
- identity = identity,
- phase = (phase1, phase2))
- else:
- if g_b < g_e:
- """
- f.B f.E
- f ----------->
- g ------------->
- g.B g.E
- """
- if g_b == 0 or f_e - f_l == 0:
- continue
- sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = (g_id, g_b, 0),
- length = abs(g_b - 0),
- score = -score,
- identity = identity,
- phase = (phase1, phase2))
- sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
- length = abs(f_e-f_l),
- score = -score,
- identity = identity,
- phase = (phase2, phase1))
- else:
- """
- f.B f.E
- f ----------->
- g <-------------
- g.E g.B
- """
- if g_b - g_l == 0 or f_e - f_l ==0:
- continue
- sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = (g_id, g_b, g_l),
- length = abs(g_b - g_l),
- score = -score,
- identity = identity,
- phase = (phase1, phase2))
- sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
- length = abs(f_e - f_l),
- score = -score,
- identity = identity,
- phase=(phase2, phase1))
-
-
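Each overlap record accepted by the loop above yields a symmetric pair of string-graph edges; the label (read_id, b, e) stored on an edge is the unaligned overhang of one read that a walk across that edge appends. For the first case pictured (both reads on the forward strand, with f's prefix and g's suffix unaligned), with fabricated ids and coordinates:

    f_id, g_id = "000000001", "000000002"   # hypothetical read ids
    f_b, f_e, f_l = 300, 5000, 5000          # f aligned from 300 to its end
    g_b, g_e, g_l = 0, 4700, 6000            # g aligned from its start to 4700

    # g:B -> f:B travels f's unaligned prefix (reverse direction: 300 down to 0)
    edge1 = ("%s:B" % g_id, "%s:B" % f_id, (f_id, f_b, 0))
    # f:E -> g:E travels g's unaligned suffix (forward: 4700 to 6000)
    edge2 = ("%s:E" % f_id, "%s:E" % g_id, (g_id, g_e, g_l))
    print(edge1)
    print(edge2)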
- cp_edges = sg.remove_cross_phase_edges()
- sg.init_reduce_dict()
-
- #if not args.disable_chimer_prediction:
- # sg.mark_chimer_edges()
- #sg.mark_spur_edge()
-
-
- sg.mark_tr_edges() # mark edges that are transitively redundant
-
- if DEBUG_LOG_LEVEL > 1:
- print sum( [1 for c in sg.e_reduce.values() if c == True] )
- print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
- chimer_nodes, chimer_edges = sg.mark_chimer_edges()
-
- removed_edges = set()
- if args.lfc == True:
- removed_edges = sg.resolve_repeat_edges()
- else:
- removed_edges = sg.mark_best_overlap() # mark those edges that are best overlap edges
-
- spur_edges = sg.mark_spur_edge()
-
- if DEBUG_LOG_LEVEL > 1:
- print sum( [1 for c in sg.e_reduce.values() if c == False] )
-
- #max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
- out_f = open("sg_edges_list", "w")
- nxsg = nx.DiGraph()
- edge_data = {}
- for v, w in sg.edges:
- e = sg.edges[ (v, w) ]
- rid, sp, tp = e.attr["label"]
- score = e.attr["score"]
- identity = e.attr["identity"]
- length = abs(sp-tp)
-
-
- if sg.e_reduce[(v, w)] != True:
- type_ = "G"
- label = "%s:%d-%d" % (rid, sp, tp)
- nxsg.add_edge(v, w, label = label, length = length, score = score)
- edge_data[ (v, w) ] = (rid, sp, tp, length, score, identity, type_)
- elif (v, w) in chimer_edges:
- type_ = "C"
- elif (v, w) in removed_edges:
- type_ = "R"
- elif (v, w) in spur_edges:
- type_ = "S"
- elif sg.e_reduce[(v, w)] == True:
- type_ = "TR"
-
- print >>out_f, v, w, rid, sp, tp, score, identity, type_
-
- for k,e in cp_edges.items():
- v = k[0]
- w = k[1]
- rid, sp, tp = e.attr["label"]
- score = e.attr["score"]
- identity = e.attr["identity"]
- length = abs(sp-tp)
- print >>out_f, v, w, rid, sp, tp, score, identity, "CP"
-
-
- out_f.close()
- nxsg_r = nxsg.reverse()
-
- return nxsg, nxsg_r, edge_data
-
-
-
-def construct_compound_paths(ug, u_edge_data):
-
- source_nodes = set()
- sink_nodes = set()
- simple_nodes = set()
- branch_nodes = set()
-
- all_nodes = ug.nodes()
- for n in all_nodes:
- in_degree = len( ug.in_edges(n) )
- out_degree = len( ug.out_edges(n) )
- if in_degree == 0:
- source_nodes.add(n)
- if out_degree == 0:
- sink_nodes.add(n)
- if in_degree == 1 and out_degree == 1:
- simple_nodes.add(n)
- if in_degree > 1 or out_degree > 1:
- branch_nodes.add(n)
-
- #print "#", len(all_nodes),len(source_nodes), len(sink_nodes), len(simple_nodes), len(branch_nodes)
- compound_paths_0 = []
- for p in list(branch_nodes):
- if ug.out_degree(p) > 1:
- coverage, data, data_r = find_bundle(ug, u_edge_data, p, 48, 16, 500000)
- if coverage == True:
- start_node, end_node, bundle_edges, length, score, depth = data
- compound_paths_0.append( (start_node, "NA", end_node, 1.0*len(bundle_edges)/depth, length, score, bundle_edges ) )
-
- compound_paths_0.sort( key=lambda x: -len(x[6]) )
-
-
- edge_to_cpath = {}
- compound_paths_1 = {}
- for s, v, t, width, length, score, bundle_edges in compound_paths_0:
- if DEBUG_LOG_LEVEL > 1:
- print "constructing utg, test ", s,v, t
-
- overlapped = False
- for vv, ww, kk in list(bundle_edges):
- if (vv, ww, kk) in edge_to_cpath:
- if DEBUG_LOG_LEVEL > 1:
- print "remove overlapped utg", (s, v, t), (vv, ww, kk)
- overlapped = True
- break
- rvv = reverse_end(vv)
- rww = reverse_end(ww)
- rkk = reverse_end(kk)
- if (rww, rvv, rkk) in edge_to_cpath:
- if DEBUG_LOG_LEVEL > 1:
- print "remove overlapped r utg", (s, v, t), (rww, rvv, rkk)
- overlapped = True
- break
-
-
- if not overlapped:
- if DEBUG_LOG_LEVEL > 1:
- print "constructing", s,v, t
-
- bundle_edges_r = []
- rs = reverse_end(t)
- rt = reverse_end(s)
-
- for vv, ww, kk in list(bundle_edges):
- edge_to_cpath.setdefault( (vv, ww, kk), set() )
- edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
- rvv = reverse_end(ww)
- rww = reverse_end(vv)
- rkk = reverse_end(kk)
- edge_to_cpath.setdefault( (rvv, rww, rkk), set() )
- edge_to_cpath[ (rvv, rww, rkk) ].add( (rs, rt, v) ) #assert v == "NA"
- bundle_edges_r.append( (rvv, rww, rkk) )
-
- compound_paths_1[ ( s, v, t) ] = width, length, score, bundle_edges
- compound_paths_1[ ( rs, v, rt) ] = width, length, score, bundle_edges_r
-
-
- compound_paths_2 = {}
- edge_to_cpath = {}
- for s, v, t in compound_paths_1:
- rs = reverse_end(t)
- rt = reverse_end(s)
- if (rs, "NA", rt) not in compound_paths_1:
- if DEBUG_LOG_LEVEL > 1:
- print "non_compliment bundle", s, v, t, len(compound_paths_1[( s, v, t)][-1])
- continue
- width, length, score, bundle_edges = compound_paths_1[ (s, v, t) ]
- compound_paths_2[ (s, v, t) ] = width, length, score, bundle_edges
- for vv, ww, kk in list(bundle_edges):
- edge_to_cpath.setdefault( (vv, ww, kk), set() )
- edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
-
-
- compound_paths_3 = {}
- for k, val in compound_paths_2.items():
-
- start_node, NA, end_node = k
- rs = reverse_end(end_node)
- rt = reverse_end(start_node)
- assert (rs, "NA", rt) in compound_paths_2
-
- contained = False
- for vv, ww, kk in ug.out_edges(start_node, keys=True):
- if len(edge_to_cpath.get( (vv, ww, kk), [] )) > 1:
- contained = True
-
- if not contained:
- compound_paths_3[k] = val
- if DEBUG_LOG_LEVEL > 1:
- print "compound", k
-
- compound_paths = {}
- for s, v, t in compound_paths_3:
- rs = reverse_end(t)
- rt = reverse_end(s)
- if (rs, "NA", rt) not in compound_paths_3:
- continue
- compound_paths[ (s, v, t) ] = compound_paths_3[ (s, v, t) ]
-
- return compound_paths
-
-def parse_args(argv):
-
- parser = argparse.ArgumentParser(description='a string graph assembler that is designed for handling diploid genomes')
- parser.add_argument('overlap_file', help='a file that contains the overlap information.')
- parser.add_argument('--min_len', type=int, default=4000,
- help='minimum length of the reads to be considered for assembling')
- parser.add_argument('--min_idt', type=float, default=96,
- help='minimum alignment identity of the reads to be considered for assembling')
- parser.add_argument('--lfc', action="store_true", default=False,
- help='use the local flow constraint method rather than the best overlap method to resolve knots in the string graph')
- args = parser.parse_args(argv[1:])
- return args
-
-def main(argv=sys.argv):
- args = parse_args(argv)
-
- # transitive reduction, remove spurs, remove putative edges caused by repeats
- sg, sg_r, edge_data = generate_string_graph(args)
-
-
- simple_paths = {}
- dual_path = {}
-
-
- sg2 = nx.DiGraph()
-
- for v, w in edge_data:
-
- assert (reverse_end(w), reverse_end(v)) in edge_data
-
- #if (v, w) in masked_edges:
- # continue
-
- rid, sp, tp, length, score, identity, type_ = edge_data[ (v, w) ]
- if type_ != "G":
- continue
-
- label = "%s:%d-%d" % (rid, sp, tp)
- sg2.add_edge( v, w, label = label, length = length, score = score)
-
-
- # utg construction phase 1, identify all simple paths
- s_nodes = set()
- t_nodes = set()
- simple_nodes = set()
-
- all_nodes = sg2.nodes()
- for n in all_nodes:
- in_degree = len( sg2.in_edges(n) )
- out_degree = len( sg2.out_edges(n) )
- if in_degree == 1 and out_degree == 1:
- simple_nodes.add(n)
- else:
- if out_degree != 0:
- s_nodes.add(n)
- if in_degree != 0:
- t_nodes.add(n)
-
-
- free_edges = set(sg2.edges())
-
- if DEBUG_LOG_LEVEL > 1:
- for s in list(simple_nodes):
- print "simple_node", s
- for s in list(s_nodes):
- print "s_node", s
- for s in list(t_nodes):
- print "t_node", s
-
- for v,w in free_edges:
- if (reverse_end(w), reverse_end(v) ) not in free_edges:
- print "bug", v,w
- print reverse_end(w), reverse_end(v)
-
- while len(free_edges) != 0:
- if len(s_nodes) != 0:
- n = s_nodes.pop()
- if DEBUG_LOG_LEVEL > 1:
- print "initial utg 1", n
- else:
- e = free_edges.pop()
- free_edges.add(e)
- n = e[0]
- if DEBUG_LOG_LEVEL > 1:
- print "initial utg 2", n
-
- path = []
- path_length =0
- path_score = 0
- for v, w in sg2.out_edges(n):
- if (v, w) not in free_edges:
- continue
- rv = reverse_end(v)
- rw = reverse_end(w)
-
- path_length = 0
- path_score = 0
- v0 = v
- w0 = w
- path = [v, w]
- path_edges = set()
- path_edges.add( (v, w) )
- path_length += edge_data[ (v, w) ][3]
- path_score += edge_data[ (v, w) ][4]
- free_edges.remove( (v, w) )
-
- r_path_length = 0
- r_path_score = 0
- rv0 = rv
- rw0 = rw
- r_path = [rv, rw] # need to reverse again
- r_path_edges = set()
- r_path_edges.add( (rw, rv) )
- r_path_length += edge_data[ (rw, rv) ][3]
- r_path_score += edge_data[ (rw, rv) ][4]
- free_edges.remove( (rw, rv) )
-
- while w in simple_nodes:
- w, w_ = sg2.out_edges(w)[0]
- if (w, w_) not in free_edges:
- break
- rw_, rw = reverse_end(w_), reverse_end(w)
-
- if ( rw_, rw ) in path_edges:
- break
-
- path.append(w_)
- path_edges.add( (w, w_) )
- path_length += edge_data[ (w, w_) ][3]
- path_score += edge_data[ (w, w_) ][4]
- free_edges.remove( (w, w_) )
-
- r_path.append(rw_)
- r_path_edges.add( (rw_, rw) )
- r_path_length += edge_data[ (rw_, rw) ][3]
- r_path_score += edge_data[ (rw_, rw) ][4]
- free_edges.remove( (rw_, rw) )
-
-
- w = w_
-
- simple_paths[ (v0, w0, path[-1]) ] = path_length, path_score, path
- r_path.reverse()
- assert r_path[0] == reverse_end(path[-1])
- simple_paths[ (r_path[0], rw0, rv0) ] = r_path_length, r_path_score, r_path
-
- if DEBUG_LOG_LEVEL > 1:
- print path_length, path_score, path
-
- dual_path[ (r_path[0], rw0, rv0) ] = (v0, w0, path[-1])
- dual_path[ (v0, w0, path[-1]) ] = (r_path[0], rw0, rv0)
-
-
-
- ug = nx.MultiDiGraph()
- u_edge_data = {}
- circular_path = set()
-
- for s, v, t in simple_paths:
- length, score, path = simple_paths[ (s, v, t) ]
- u_edge_data[ (s, t, v) ] = (length, score, path, "simple")
- if s != t:
- ug.add_edge(s, t, key = v, type_ = "simple", via = v, length = length, score = score)
- else:
- circular_path.add( (s, t, v) )
-
-
- if DEBUG_LOG_LEVEL > 1:
- with open("utg_data0","w") as f:
- for s, t, v in u_edge_data:
- rs = reverse_end(t)
- rt = reverse_end(s)
- rv = reverse_end(v)
- assert (rs, rt, rv) in u_edge_data
- length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
-
- if type_ == "compound":
- path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
- else:
- path_or_edges = "~".join( path_or_edges )
- print >>f, s, v, t, type_, length, score, path_or_edges
-
- # identify spurs in the utg graph
- # Currently, we use ad-hoc logic to filter out shorter utgs, but we can
- # add a proper alignment comparison later to remove redundant utgs
-
- utg_spurs = set()
- all_nodes = ug.nodes()
-
- ug2 = ug.copy()
-
- s_candidates = set( )
- for v in ug2.nodes():
- if ug2.in_degree(v) == 0:
- s_candidates.add(v)
-
- while len(s_candidates) > 0:
- n = s_candidates.pop()
- if ug2.in_degree(n) != 0:
- continue
- n_ego_graph = nx.ego_graph( ug2, n, radius = 10 )
- n_ego_node_set = set( n_ego_graph.nodes() )
- for b_node in n_ego_graph.nodes():
- if ug2.in_degree(b_node) <= 1:
- continue
-
- with_extern_node = False
- b_in_nodes = [e[0] for e in ug2.in_edges(b_node)]
-
- if len(b_in_nodes) == 1:
- continue
-
- for v in b_in_nodes:
- if v not in n_ego_node_set:
- with_extern_node = True
- break
-
- if not with_extern_node:
- continue
-
- s_path = nx.shortest_path( ug2, n, b_node )
- v1 = s_path[0]
- total_length = 0
- for v2 in s_path[1:]:
- for s, t, v in ug2.out_edges(v1, keys=True):
- if t != v2:
- continue
- length, score, edges, type_ = u_edge_data[ (s, t, v) ]
- total_length += length
- v1 = v2
-
- if total_length >= 50000:
- continue
-
- v1 = s_path[0]
- for v2 in s_path[1:]:
- for s, t, v in ug2.out_edges(v1, keys=True):
- if t != v2:
- continue
- length, score, edges, type_ = u_edge_data[ (s, t, v) ]
- rs = reverse_end(t)
- rt = reverse_end(s)
- rv = reverse_end(v)
- try:
- ug2.remove_edge( s, t, key= v)
- ug2.remove_edge( rs, rt, key= rv)
- u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
- u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
- except Exception:
- pass
-
- if ug2.in_degree(v2) == 0:
- s_candidates.add(v2)
- v1 = v2
- break
-
- #phase 2, finding all "consistent" compound paths
- compound_paths = construct_compound_paths(ug2, u_edge_data)
- compound_path_file = open("c_path","w")
-
- ug2_edges = set(ug2.edges(keys = True))
- edges_to_remove = set()
- for s, v, t in compound_paths:
- width, length, score, bundle_edges = compound_paths[ (s, v, t) ]
- print >> compound_path_file, s,v,t, width, length, score, "|".join( [e[0]+"~"+e[2]+"~"+e[1] for e in bundle_edges] )
- for ss, tt, vv in bundle_edges:
- if (ss, tt, vv) in ug2_edges:
- edges_to_remove.add( (ss, tt, vv) )
-
-
- for s, t, v in edges_to_remove:
- ug2.remove_edge( s, t ,v )
- length, score, edges, type_ = u_edge_data[ (s, t, v) ]
- if type_ != "spur":
- u_edge_data[ (s, t, v) ] = length, score, edges, "contained"
-
-
- for s, v, t in compound_paths:
- width, length, score, bundle_edges = compound_paths[ (s, v, t) ]
- u_edge_data[ (s, t, v) ] = (length, score, bundle_edges, "compound")
- ug2.add_edge( s, t, key = v, via = v, type_="compound", length = length, score = score)
-
- assert v == "NA"
- rs = reverse_end(t)
- rt = reverse_end(s)
- assert (rs, v, rt) in compound_paths
- dual_path[ (s, v, t) ] = (rs, v, rt)
- dual_path[ (rs, v, rt) ] = (s, v, t)
-
- compound_path_file.close()
-
-
- # remove short utgs using a local flow consistency rule
- """
- short UTGs like this can be removed; this kind of utg is likely an artifact of repeats
- >____ _____>
- \__UTG_>__/
- <____/ \_____<
- """
- ug_edge_to_remove = set()
- for s, t, v in ug.edges(keys=True):
- if ug2.in_degree(s) == 1 and ug2.out_degree(s) == 2 and \
- ug2.in_degree(t) == 2 and ug2.out_degree(t) == 1:
- length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
- if length < 60000:
- rs = reverse_end(t)
- rt = reverse_end(s)
- rv = reverse_end(v)
- ug_edge_to_remove.add( (s, t, v) )
- ug_edge_to_remove.add( (rs, rt, rv) )
- for s, t, v in list(ug_edge_to_remove):
- ug2.remove_edge(s, t, key=v)
- length, score, edges, type_ = u_edge_data[ (s, t, v) ]
- u_edge_data[ (s, t, v) ] = length, score, edges, "repeat_bridge"
-
- ug = ug2
-
- with open("utg_data","w") as f:
- for s, t, v in u_edge_data:
- length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
-
- if v == "NA":
- path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
- else:
- path_or_edges = "~".join( path_or_edges )
- print >>f, s, v, t, type_, length, score, path_or_edges
-
- # contig construction from utgs
-
- s_nodes = set()
- t_nodes = set()
- simple_nodes = set()
- simple_out = set()
- simple_in = set()
-
- all_nodes = ug.nodes()
- for n in all_nodes:
- in_degree = len( ug.in_edges(n) )
- out_degree = len( ug.out_edges(n) )
- if in_degree == 1 and out_degree == 1:
- simple_nodes.add(n)
- else:
- if out_degree != 0:
- s_nodes.add(n)
- if in_degree != 0:
- t_nodes.add(n)
- if out_degree == 1:
- simple_out.add(n)
- if in_degree == 1:
- simple_in.add(n)
-
- all_nodes = set(all_nodes)
- c_path = []
-
- free_edges = set()
- for s, t, v in ug.edges(keys=True):
- free_edges.add( (s, t, v) )
-
- while len(free_edges) != 0:
-
- if len(s_nodes) != 0:
- n = s_nodes.pop()
- else:
- e = free_edges.pop()
- n = e[0]
-
- for s, t, v in ug.out_edges(n, keys=True):
- path_start = n
- path_end = None
- path_key = None
- path = []
- path_length = 0
- path_score = 0
- path_nodes = set()
- path_nodes.add(s)
- if DEBUG_LOG_LEVEL > 1:
- print "check 1", s, t, v
- path_key = t
- t0 = s
- while t in simple_out:
- if t in path_nodes:
- break
- rt = reverse_end(t)
- if rt in path_nodes:
- break
-
- path.append( (t0, t, v) )
- path_nodes.add(t)
- length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
- path_length += length
- path_score += score
- assert len( ug.out_edges( t, keys=True ) ) == 1 # t is "simple_out" node
- t0, t, v = ug.out_edges( t, keys=True )[0]
-
- path.append( (t0, t, v) )
- length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
- path_length += length
- path_score += score
- path_nodes.add(t)
- path_end = t
-
- c_path.append( (path_start, path_key, path_end, path_length, path_score, path, len(path)) )
- if DEBUG_LOG_LEVEL > 1:
- print "c_path", path_start, path_key, path_end, path_length, path_score, len(path)
- for e in path:
- if e in free_edges:
- free_edges.remove( e )
-
- if DEBUG_LOG_LEVEL > 1:
- print "left over edges:", len(free_edges)
-
-
-
- free_edges = set()
- for s, t, v in ug.edges(keys=True):
- free_edges.add( (s, t, v) )
-
-
- ctg_id = 0
-
- ctg_paths = open("ctg_paths","w")
-
- c_path.sort( key=lambda x: -x[3] )
-
-
- for path_start, path_key, path_end, p_len, p_score, path, n_edges in c_path:
- length = 0
- score = 0
- length_r = 0
- score_r = 0
-
- non_overlapped_path = []
- non_overlapped_path_r = []
- for s, t, v in path:
- if v != "NA":
- rs, rt, rv = reverse_end(t), reverse_end(s), reverse_end(v)
- else:
- rs, rt, rv = reverse_end(t), reverse_end(s), "NA"
- if (s, t, v) in free_edges and (rs, rt, rv) in free_edges:
- non_overlapped_path.append( (s,t,v) )
- non_overlapped_path_r.append( (rs, rt, rv) )
- length += u_edge_data[ (s, t, v) ][0]
- score += u_edge_data[ (s, t, v) ][1]
- length_r += u_edge_data[ (rs, rt, rv) ][0]
- score_r += u_edge_data[ (rs, rt, rv) ][1]
- else:
- break
-
- if len(non_overlapped_path) == 0:
- continue
- s0, t0, v0 = non_overlapped_path[0]
- end_node = non_overlapped_path[-1][1]
-
- print >> ctg_paths, "%06dF" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length, score, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path ] )
- non_overlapped_path_r.reverse()
- s0, t0, v0 = non_overlapped_path_r[0]
- end_node = non_overlapped_path_r[-1][1]
- print >> ctg_paths, "%06dR" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length_r, score_r, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path_r ] )
- ctg_id += 1
- for e in non_overlapped_path:
- if e in free_edges:
- free_edges.remove(e)
- for e in non_overlapped_path_r:
- if e in free_edges:
- free_edges.remove(e)
-
-
-
- for s, t, v in list(circular_path):
- length, score, path, type_ = u_edge_data[ (s, t, v) ]
- print >> ctg_paths, "%6d" % ctg_id, "ctg_circular", s+"~"+v+"~"+t, t, length, score, s+"~"+v+"~"+t
- ctg_id += 1
-
- ctg_paths.close()
diff --git a/FALCON_unzip/falcon_unzip/phasing.py b/FALCON_unzip/falcon_unzip/phasing.py
deleted file mode 100644
index 6bd3151..0000000
--- a/FALCON_unzip/falcon_unzip/phasing.py
+++ /dev/null
@@ -1,570 +0,0 @@
-from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
- makePypeLocalFile, fn, PypeTask)
-from falcon_kit.FastaReader import FastaReader
-import argparse
-import logging
-import os
-import re
-import shlex
-import subprocess
-import sys
-
-cigar_re = r"(\d+)([MIDNSHP=X])"
-
-def make_het_call(self):
-
- bam_fn = fn(self.bam_file)
- ctg_id = self.parameters["ctg_id"]
- ref_seq = self.parameters["ref_seq"]
- base_dir = self.parameters["base_dir"]
- vmap_fn = fn(self.vmap_file)
- vpos_fn = fn(self.vpos_file)
- q_id_map_fn = fn(self.q_id_map_file)
-
- p = subprocess.Popen(shlex.split("samtools view %s %s" % (bam_fn, ctg_id) ), stdout=subprocess.PIPE)
- pileup = {}
- q_id_map = {}
- q_max_id = 0
- q_id = 0
- q_name_to_id = {}
-
- try:
- os.makedirs("%s/%s" % (base_dir, ctg_id))
- except OSError:
- pass
-
- vmap = open(vmap_fn, "w")
- vpos = open(vpos_fn, "w")
-
- for l in p.stdout:
- l = l.strip().split()
- if l[0][0] == "@":
- continue
-
- QNAME = l[0]
- if QNAME not in q_name_to_id:
- q_id = q_max_id
- q_name_to_id[QNAME] = q_id
- q_max_id += 1
-
- q_id = q_name_to_id[QNAME]
- q_id_map[q_id] = QNAME
- FLAG = int(l[1])
- RNAME = l[2]
- POS = int(l[3]) - 1 # convert to zero-based coordinates
- CIGAR = l[5]
- SEQ = l[9]
- rp = POS
- qp = 0
-
- skip_base = 0
- total_aln_pos = 0
- for m in re.finditer(cigar_re, CIGAR):
- adv = int(m.group(1))
- total_aln_pos += adv
-
- if m.group(2) == "S":
- skip_base += adv
-
- if 1.0 - 1.0 * skip_base / total_aln_pos < 0.1:
- continue
- if total_aln_pos < 2000:
- continue
-
- for m in re.finditer(cigar_re, CIGAR):
- adv = int(m.group(1))
- if m.group(2) == "S":
- qp += adv
- if m.group(2) == "M":
- matches = []
- for i in range(adv):
- matches.append( (rp, SEQ[qp]) )
- rp += 1
- qp += 1
- matches = matches[1:-1]
- for pos, b in matches:
- pileup.setdefault(pos, {})
- pileup[pos].setdefault(b, [])
- pileup[pos][b].append(q_id)
- elif m.group(2) == "I":
- for i in range(adv):
- qp += 1
- elif m.group(2) == "D":
- for i in range(adv):
- rp += 1
-
- pos_k = pileup.keys()
- pos_k.sort()
- th = 0.25
- for pos in pos_k:
- if pos < POS:
- if len(pileup[pos]) < 2:
- del pileup[pos]
- continue
- base_count = []
- total_count = 0
- for b in ["A", "C", "G", "T"]:
- count = len(pileup[pos].get(b,[]))
- base_count.append( (count, b) )
- total_count += count
- if total_count < 10:
- del pileup[pos]
- continue
-
- base_count.sort()
- base_count.reverse()
- p0 = 1.0 * base_count[0][0] / total_count
- p1 = 1.0 * base_count[1][0] / total_count
- if p0 < 1.0 - th and p1 > th:
- b0 = base_count[0][1]
- b1 = base_count[1][1]
- ref_base = ref_seq[pos]
- print >> vpos, pos+1, ref_base, total_count, " ".join(["%s %d" % (x[1], x[0]) for x in base_count])
- for q_id_ in pileup[pos][b0]:
- print >> vmap, pos+1, ref_base, b0, q_id_
- for q_id_ in pileup[pos][b1]:
- print >> vmap, pos+1, ref_base, b1, q_id_
- del pileup[pos]
-
-
- q_id_map_f = open(q_id_map_fn, "w")
- for q_id, q_name in q_id_map.items():
- print >> q_id_map_f, q_id, q_name
-
-
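Inside make_het_call, the cigar_re loop advances a reference pointer (rp) and a query pointer (qp) according to each CIGAR operation before recording the per-position base pileups. A self-contained sketch of just that coordinate bookkeeping, with a made-up CIGAR string and start position:

    import re
    cigar_re = r"(\d+)([MIDNSHP=X])"

    rp, qp = 100, 0                       # hypothetical 0-based reference start
    for m in re.finditer(cigar_re, "5S10M2I3D8M"):
        adv, op = int(m.group(1)), m.group(2)
        if op == "S":                     # soft clip: consumes query only
            qp += adv
        elif op == "M":                   # match/mismatch: consumes both
            rp += adv
            qp += adv
        elif op == "I":                   # insertion: consumes query only
            qp += adv
        elif op == "D":                   # deletion: consumes reference only
            rp += adv
    print((rp, qp))                       # -> (121, 25)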
-def generate_association_table(self):
-
- vmap_fn = fn(self.vmap_file)
- atable_fn = fn(self.atable_file)
- ctg_id = self.parameters["ctg_id"]
- base_dir = self.parameters["base_dir"]
-
- vmap = {}
- v_positions = []
-
- with open(vmap_fn) as f:
- for l in f:
- l = l.strip().split()
- pos = int(l[0])
- ref_b = l[1]
- v_b = l[2]
- q_id = int(l[3])
- if (pos, ref_b) not in vmap:
- v_positions.append( (pos, ref_b) )
- vmap.setdefault( (pos, ref_b), {} )
- vmap[ (pos, ref_b) ].setdefault(v_b, [])
- vmap[ (pos, ref_b) ][v_b].append( q_id )
-
-
- #xary = []
- #yary = []
- with open(atable_fn, "w") as out_f:
- for i1 in xrange(len(v_positions)):
- link_count = 0
- for i2 in xrange(i1+1, len(v_positions)):
- pos1, rb1 = v_positions[i1]
- pos2, rb2 = v_positions[i2]
- if pos2 - pos1 > (1 << 16):
- continue
- ct = {}
- p1table = []
- p2table = []
- s1 = 0
- list1 = vmap[ (pos1, rb1) ].items()
- for b1, qids1 in list1:
- p1table.append( (b1, len(qids1) ) )
- s1 += len(qids1)
-
- s2 = 0
- list2 = vmap[ (pos2, rb2) ].items()
- for b2, qids2 in list2:
- p2table.append( (b2, len(qids2) ) )
- s2 += len(qids2)
-
- total_s = 0
- for b1, qids1 in list1:
- for b2, qids2 in list2:
- s = len(set(qids1) & set(qids2))
- ct[(b1,b2)] = s
- total_s += s
- if total_s < 6:
- continue
-
- b11 = p1table[0][0]
- b12 = p1table[1][0]
- b21 = p2table[0][0]
- b22 = p2table[1][0]
- print >> out_f, pos1, b11, b12, pos2, b21, b22, ct[(b11,b21)], ct[(b11,b22)], ct[(b12,b21)], ct[(b12,b22)]
-
-
- #xary.append(pos1)
- #yary.append(pos2)
- link_count += 1
- if link_count > 500:
- break
-
-def get_score( c_score, pos1, pos2, s1, s2 ):
- if pos1 > pos2:
- pos1, pos2 = pos2, pos1
- s1, s2 = s2, s1
- b11, b12 = s1
- b21, b22 = s2
- return c_score[ (pos1, pos2) ][ (b11+b21, b12+b22) ]
-
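get_score looks up, for an ordered position pair, how many reads support a particular assignment of the two alleles at each site to the two haplotypes; the keys are the concatenated allele pairs that get_phased_blocks stores when it reads the association table. A small usage sketch with fabricated counts, assuming the function above is in scope:

    # reads supporting the cis configuration (A with C / G with T) vs. trans
    c_score = {
        (100, 250): {("AC", "GT"): 17, ("GT", "AC"): 17,
                     ("AT", "GC"): 3,  ("GC", "AT"): 3},
    }
    print(get_score(c_score, 100, 250, ("A", "G"), ("C", "T")))   # -> 17
    print(get_score(c_score, 250, 100, ("C", "T"), ("A", "G")))   # -> 17 (positions get swapped)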
-def get_phased_blocks(self):
- vmap_fn = fn(self.vmap_file)
- atable_fn = fn(self.atable_file)
- p_variant_fn = fn(self.phased_variant_file)
-
- left_connect = {}
- right_connect = {}
-
- c_score = {}
- states = {}
- positions = set()
-
-
-
- ref_base = {}
- with open(vmap_fn) as f:
- for l in f:
- l = l.strip().split()
- pos = int(l[0])
- ref_b = l[1]
- v_b = l[2]
- q_id = int(l[3])
- ref_base[pos] = ref_b
-
- with open(atable_fn) as f:
- for l in f:
- l = l.strip().split()
- pos1, b11, b12, pos2, b21, b22, s11, s12, s21, s22 = l
- s11, s12, s21, s22 = int(s11), int(s12), int(s21), int(s22)
- if abs(s11+s22-s12-s21) < 6:
- continue
- pos1 = int(pos1)
- pos2 = int(pos2)
- positions.add(pos1)
- positions.add(pos2)
- right_connect.setdefault(pos1, [])
- right_connect[pos1].append(pos2)
- left_connect.setdefault(pos2, [])
- left_connect[pos2].append(pos1)
- c_score[ (pos1, pos2) ] = { (b11+b21, b12+b22): s11 + s22, (b12+b22, b11+b21): s11 + s22,
- (b12+b21, b11+b22): s12 + s21, (b11+b22, b12+b21): s12 + s21 }
-
-
- if pos1 not in states:
- st1 = (b11, b12)
- st2 = (b12, b11)
- score1 = 0
- score2 = 0
- for pp in left_connect.get(pos1,[]):
- if pp in states:
- st0 = states[pp]
- else:
- continue
- score1 += get_score( c_score, pp, pos1, st0, st1 )
- score2 += get_score( c_score, pp, pos1, st0, st2 )
-
- for pp in right_connect.get(pos1,[]):
- if pp in states:
- st0 = states[pp]
- else:
- continue
- score1 += get_score( c_score, pos1, pp, st1, st0 )
- score2 += get_score( c_score, pos1, pp, st2, st0 )
-
- if score1 >= score2:
- states[pos1] = st1
- else:
- states[pos1] = st2
-
- if pos2 not in states:
- st1 = (b21, b22)
- st2 = (b22, b21)
- score1 = 0
- score2 = 0
- for pp in left_connect.get(pos2,[]):
- if pp in states:
- st0 = states[pp]
- else:
- continue
- score1 += get_score( c_score, pp, pos2, st0, st1 )
- score2 += get_score( c_score, pp, pos2, st0, st2 )
-
- for pp in right_connect.get(pos2,[]):
- if pp in states:
- st0 = states[pp]
- else:
- continue
- score1 += get_score( c_score, pos2, pp, st1, st0 )
- score2 += get_score( c_score, pos2, pp, st2, st0 )
-
- if score1 >= score2:
- states[pos2] = st1
- else:
- states[pos2] = st2
-
- positions = list(positions)
- positions.sort()
-
-
- iter_count = 0
- while 1:
- iter_count += 1
- if iter_count > 10:
- break
- update_count = 0
- for p in positions:
- b1, b2 = states[p]
- st1 = (b1, b2)
- st2 = (b2, b1)
-
- score1 = 0
- score2 = 0
- for pp in left_connect.get(p,[]):
- st0 = states[pp]
- score1 += get_score( c_score, pp, p, st0 ,st1)
- score2 += get_score( c_score, pp, p, st0, st2)
-
- #for pp in right_connect.get(p,[]):
- # st0 = states[pp]
- # score1 += get_score( c_score, p, pp, st1 ,st0)
- # score2 += get_score( c_score, p, pp, st2, st0)
-
- if score1 >= score2:
- states[p] = st1
- else:
- states[p] = st2
- update_count += 1
- if update_count == 0:
- break
-
-
- right_extent = {}
- right_score = {}
- left_extent = {}
- left_score = {}
-
-
- for p in positions:
-
- left_extent[p] = p
- left_score[p] = 0
- if p in left_connect:
- left = p
- st0 = states[p]
- st0_ = st0[1], st0[0]
- for pp in left_connect[p]:
- st1 = states[pp]
- s = get_score( c_score, pp, p, st1, st0)
- s_ = get_score( c_score, pp, p, st1, st0_)
- left_score[p] += s - s_
- if s - s_ > 0 and pp < left:
- left = pp
- left_extent[p] = left
-
- right_extent[p] = p
- right_score[p] = 0
- if p in right_connect:
- right = p
- st0 = states[p]
- st0_ = st0[1], st0[0]
- for pp in right_connect[p]:
- st1 = states[pp]
- s = get_score( c_score, p, pp, st0, st1)
- s_ = get_score( c_score, p, pp, st0_, st1)
- right_score[p] += s - s_
- if s - s_ > 0 and pp > right:
- right = pp
- right_extent[p] = right
-
-
-
-
- phase_block_id = 1
- phase_blocks = {}
- pb = []
-
- max_right_ext = 0
- for p in positions:
- if right_score[p] < 10 or left_score[p] < 10:
- continue
- b1, b2 = states[p]
- if max_right_ext < left_extent[p]:
- if len(pb) > 3:
- phase_blocks[phase_block_id] = pb
- phase_block_id += 1
- pb = []
- pb.append( (p, b1, b2) )
- if right_extent[p] > max_right_ext:
- max_right_ext = right_extent[p]
- if len(pb) > 3:
- phase_blocks[phase_block_id] = pb
- else:
- phase_block_id -= 1
-
-
- with open(p_variant_fn, "w") as out_f:
- for pid in xrange(1, phase_block_id+1):
- if len(phase_blocks[pid]) == 0:
- continue
- min_ = min( [x[0] for x in phase_blocks[pid]] )
- max_ = max( [x[0] for x in phase_blocks[pid]] )
-
- print >>out_f, "P", pid, min_, max_, max_ - min_, len(phase_blocks[pid]), 1.0 * (max_-min_)/len(phase_blocks[pid])
- for p, b1, b2 in phase_blocks[pid]:
- rb = ref_base[p]
- print >>out_f, "V", pid, p, "%d_%s_%s" % (p,rb,b1), "%d_%s_%s" % (p,rb,b2), left_extent[p], right_extent[p], left_score[p], right_score[p]
-
-def get_phased_reads(self):
-
- q_id_map_fn = fn(self.q_id_map_file)
- vmap_fn = fn(self.vmap_file)
- p_variant_fn = fn(self.phased_variant_file)
- parameters = self.parameters
-
- ctg_id = parameters["ctg_id"]
-
- phased_read_fn = fn(self.phased_read_file)
-
- rid_map = {}
- with open(q_id_map_fn) as f:
- for l in f:
- l = l.strip().split()
- rid_map[int(l[0])] = l[1]
-
-
- read_to_variants = {}
- variant_to_reads = {}
- with open(vmap_fn) as f:
- for l in f:
- l = l.strip().split()
- variant = "_".join(l[:3])
- read_id = int(l[3])
- read_to_variants.setdefault(read_id, set())
- read_to_variants[read_id].add(variant)
- variant_to_reads.setdefault(variant, set())
- variant_to_reads[variant].add(read_id)
-
-
- variant_to_phase = {}
- with open(p_variant_fn) as f:
- for l in f:
- """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
- l = l.strip().split()
- if l[0] != "V":
- continue
- pb_id = int(l[1])
- variant_to_phase[ l[3] ] = (pb_id, 0)
- variant_to_phase[ l[4] ] = (pb_id, 1)
-
- with open(phased_read_fn, "w") as out_f:
- for r in read_to_variants:
- vl = {}
- pl = set()
- for v in list( read_to_variants[r] ):
- if v in variant_to_phase:
- p = variant_to_phase[v]
- vl[ p ] = vl.get(p, 0) + 1
- pl.add(p[0])
- pl = list(pl)
- pl.sort()
- for p in pl:
- if vl.get( (p,0), 0) - vl.get( (p,1), 0) > 1:
- print >> out_f, r, ctg_id, p, 0, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
- elif vl.get( (p,1), 0) - vl.get( (p,0), 0) > 1:
- print >> out_f, r, ctg_id, p, 1, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
-
-def phasing(args):
- bam_fn = args.bam
- fasta_fn = args.fasta
- ctg_id = args.ctg_id
- base_dir = args.base_dir
-
- ref_seq = ""
- for r in FastaReader(fasta_fn):
- rid = r.name.split()[0]
- if rid != ctg_id:
- continue
- ref_seq = r.sequence.upper()
-
- wf = PypeProcWatcherWorkflow(
- max_jobs=1,
- )
-
- bam_file = makePypeLocalFile(bam_fn)
- vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_map") )
- vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_pos") )
- q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "q_id_map") )
- parameters = {}
- parameters["ctg_id"] = ctg_id
- parameters["ref_seq"] = ref_seq
- parameters["base_dir"] = base_dir
-
- make_het_call_task = PypeTask( inputs = { "bam_file": bam_file },
- outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file },
- parameters = parameters,
- ) (make_het_call)
-
- wf.addTasks([make_het_call_task])
-
-
-
-
- atable_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'g_atable', "atable") )
- parameters = {}
- parameters["ctg_id"] = ctg_id
- parameters["base_dir"] = base_dir
- generate_association_table_task = PypeTask( inputs = { "vmap_file": vmap_file },
- outputs = { "atable_file": atable_file },
- parameters = parameters,
- ) (generate_association_table)
-
- wf.addTasks([generate_association_table_task])
-
-
-
-
- phased_variant_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'get_phased_blocks', "phased_variants") )
- get_phased_blocks_task = PypeTask( inputs = { "vmap_file": vmap_file, "atable_file": atable_file },
- outputs = { "phased_variant_file": phased_variant_file },
- ) (get_phased_blocks)
- wf.addTasks([get_phased_blocks_task])
-
-
-
-
- phased_read_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "phased_reads") )
- get_phased_reads_task = PypeTask( inputs = { "vmap_file": vmap_file,
- "q_id_map_file": q_id_map_file,
- "phased_variant_file": phased_variant_file },
- outputs = { "phased_read_file": phased_read_file },
- parameters = {"ctg_id": ctg_id},
- ) (get_phased_reads)
- wf.addTasks([get_phased_reads_task])
-
-
- wf.refreshTargets()
- #with open("fc_phasing_wf.dot", "w") as f:
- # print >>f, wf.graphvizDot
-
-def parse_args(argv):
- parser = argparse.ArgumentParser(description='phasing variants and reads from a bam file')
- # we can run this in parallel mode in the future
- #parser.add_argument('--n_core', type=int, default=4,
- # help='number of processes used for generating consensus')
- parser.add_argument('--bam', type=str, help='path to sorted bam file', required=True)
- parser.add_argument('--fasta', type=str, help='path to the fasta file containing the contig', required=True)
- parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
- parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, default to current working directory')
-
-
- args = parser.parse_args(argv[1:])
- return args
-
-def main(argv=sys.argv):
- logging.basicConfig()
- args = parse_args(argv)
- phasing(args)
diff --git a/FALCON_unzip/falcon_unzip/phasing_readmap.py b/FALCON_unzip/falcon_unzip/phasing_readmap.py
deleted file mode 100644
index 2e30c4b..0000000
--- a/FALCON_unzip/falcon_unzip/phasing_readmap.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import argparse
-import logging
-import os
-import re
-import sys
-
-
-def get_phasing_readmap(args):
-
- phased_reads = args.phased_reads
- read_map_dir = args.read_map_dir
- the_ctg_id = args.ctg_id
- base_dir = args.base_dir
-
- rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids')
- pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')
- rid_to_oid = open(rawread_id_file).read().split('\n') #daligner raw read id to the original ids
- pid_to_fid = open(pread_id_file).read().split('\n') #daligner pread id to the fake ids
-
- def pid_to_oid(pid):
- fid = pid_to_fid[int(pid)]
- rid = int(fid.split('/')[1])/10
- return rid_to_oid[int(rid)]
-
- rid_to_oid = open(rawread_id_file).read().split('\n') #daligner raw read id to the original ids
- pid_to_fid = open(pread_id_file).read().split('\n') #daligner pread id to the fake ids
-
-
- rid_to_phase = {}
- with open(phased_reads) as f:
- for row in f:
- row = row.strip().split()
- rid_to_phase[row[6]] = (int(row[2]), int(row[3]))
-
- arid_to_phase = {}
- map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
- with open(map_fn) as f:
- for row in f:
- row = row.strip().split()
- ctg_id = row[1]
- if not ctg_id.startswith(the_ctg_id):
- continue
- if int(row[3]) != 0: #not the best hit
- continue
- o_id = pid_to_oid(row[0])
- phase = rid_to_phase.get( o_id, (-1, 0) )
- arid_to_phase['%09d' % int(row[0])] = phase
-
- with open(os.path.join(base_dir, 'rid_to_phase.%s' % the_ctg_id),'w') as f:
- for arid, phase in arid_to_phase.items():
- print >>f, arid, the_ctg_id, phase[0], phase[1]
-
-
-
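The nested pid_to_oid helper above recovers an original read name from a daligner pread id in two hops: the pread id indexes the "fake id" table, and the middle /-separated field of that fake id, divided by 10, indexes the raw-read id table. A tiny sketch with fabricated tables (the fake-id layout here is only an assumption chosen to match that convention):

    pid_to_fid = ["000000000/10/0_5000", "000000001/20/0_7000"]   # hypothetical
    rid_to_oid = ["m0/0/0_9000", "m0/1/0_8000", "m0/2/0_6000"]    # hypothetical

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[int(rid)]

    print(pid_to_oid(1))   # -> "m0/2/0_6000"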
-def parse_args(argv):
- parser = argparse.ArgumentParser(description='mapping internal daligner read id to phase block and phase',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- # we can run this in parallel mode in the future
- #parser.add_argument('--n_core', type=int, default=4,
- # help='number of processes used for generating consensus')
- parser.add_argument('--phased_reads', type=str, help='path to read vs. phase map', required=True)
- parser.add_argument('--read_map_dir', type=str, help='path to the read map directory', required=True)
- parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
- parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, default to current working directory')
-
- args = parser.parse_args(argv[1:])
- return args
-
-
-def main(argv=sys.argv):
- logging.basicConfig()
- args = parse_args(argv)
- get_phasing_readmap(args)
diff --git a/FALCON_unzip/falcon_unzip/rr_hctg_track.py b/FALCON_unzip/falcon_unzip/rr_hctg_track.py
deleted file mode 100644
index fa02434..0000000
--- a/FALCON_unzip/falcon_unzip/rr_hctg_track.py
+++ /dev/null
@@ -1,201 +0,0 @@
-from __future__ import print_function
-from falcon_kit.multiproc import Pool
-import falcon_kit.util.io as io
-import argparse
-import sys
-import glob
-import os
-from heapq import heappush, heappop, heappushpop
-
-Reader = io.CapturedProcessReaderContext
-
-
-def get_rid_to_ctg(fn):
- rid_to_ctg = {}
- with open(fn) as f:
- for row in f:
- row = row.strip().split()
- pid, rid, oid, ctg = row
- rid_to_ctg.setdefault( rid, set() )
- rid_to_ctg[ rid ].add(ctg)
- return rid_to_ctg
-
-def run_tr_stage1(db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase):
- cmd = 'LA4Falcon -m %s %s' % (db_fn, fn)
- reader = Reader(cmd)
- with reader:
- return fn, tr_stage1(reader.readlines, min_len, bestn, rid_to_ctg, rid_to_phase)
-
-def tr_stage1(readlines, min_len, bestn, rid_to_ctg, rid_to_phase):
- """
- for each read in the b-read column inside the LAS files, we
- keep the top `bestn` hits with a priority queue over all overlaps
- """
-
- rtn = {}
- for l in readlines():
- l = l.strip().split()
- q_id, t_id = l[:2]
- overlap_len = -int(l[2])
- idt = float(l[3])
- q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
- t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
- if t_l < min_len:
- continue
- if q_id not in rid_to_ctg:
- continue
-
- t_phase = rid_to_phase[ int(t_id) ]
- if t_phase != None:
- ctg_id, block, phase = t_phase
- if block != -1:
- q_phase = rid_to_phase[ int(q_id) ]
- if q_phase != None:
- if q_phase[0] == ctg_id and q_phase[1] == block and q_phase[2] != phase:
- continue
-
- rtn.setdefault(t_id, [])
- if len(rtn[t_id]) < bestn:
- heappush(rtn[t_id], (overlap_len, q_id) )
- else:
- heappushpop(rtn[t_id], (overlap_len, q_id) )
-
- return rtn
-
-def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
- io.LOG('preparing tr_stage1')
- io.logstats()
- asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
- hasm_dir = os.path.abspath(os.path.join(base_dir, '3-unzip'))
- quiver_dir = os.path.abspath(os.path.join(base_dir, '4-quiver'))
- rid_to_ctg = get_rid_to_ctg(os.path.join(quiver_dir, 'read_maps', 'read_to_contig_map' ))
-
- phased_read_file = os.path.join(hasm_dir, 'all_phased_reads')
- oid_to_phase = {}
- with open(phased_read_file) as f:
- for row in f:
- row = row.strip().split()
- ctg_id, block, phase = row[1:4]
- oid = row[6]
- block = int(block)
- phase = int(phase)
- oid_to_phase[ oid ] = (ctg_id, block, phase)
- rid_to_phase = {}
- rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
- rid_to_oid = open(rawread_ids_fn).read().split('\n')
- rid_to_phase = [ None ] * len( rid_to_oid )
- for rid, oid in enumerate(rid_to_oid):
- rid_to_phase[rid] = oid_to_phase.get( oid, None )
-
-
- inputs = []
- for fn in file_list:
- inputs.append( (run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase) )
- """
- Aggregate the hits from each individual LAS file and keep the best n hits.
- Note that this does not guarantee that the final result is globally the best n hits,
- especially when `bestn` is too small. In that case, if a single LAS file contributes
- more than `bestn` hits, some good hits will be missed.
- """
- bread_to_areads = {}
- for fn, res in exe_pool.imap(io.run_func, inputs):
- for k in res:
- bread_to_areads.setdefault(k, [])
- for item in res[k]:
- if len(bread_to_areads[k]) < bestn:
- heappush( bread_to_areads[k], item )
- else:
- heappushpop( bread_to_areads[k], item )
-
- #rid_to_oid = open(os.path.join(rawread_dir, 'dump_rawread_ids', 'rawread_ids')).read().split('\n')
-
- """
- For each b-read, find the best contig assignment via the b-read -> a-read -> contig map.
- """
- with open(os.path.join(quiver_dir, 'read_maps/rawread_to_contigs'), 'w') as out_f:
- for bread in bread_to_areads:
-
- ctg_score = {}
- for s, rid in bread_to_areads[bread]:
- if rid not in rid_to_ctg: continue
-
- ctgs = rid_to_ctg[rid]
- for ctg in ctgs:
- ctg_score.setdefault(ctg, [0,0])
- ctg_score[ctg][0] += -s
- ctg_score[ctg][1] += 1
-
- #oid = rid_to_oid[int(bread)]
- ctg_score = ctg_score.items()
- ctg_score.sort( key = lambda k: k[1][0] )
- rank = 0
-
- for ctg, score_count in ctg_score:
- if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
- in_ctg = 1
- else:
- in_ctg = 0
- score, count = score_count
- #print(bread, oid, ctg, count, rank, score, in_ctg, file=out_f)
- print(bread, ctg, count, rank, score, in_ctg, file=out_f)
- rank += 1
-
-
-
-def try_run_track_reads(n_core, base_dir, min_len, bestn):
- io.LOG('starting track_reads')
-
- rawread_dir = os.path.abspath(os.path.join(base_dir, '0-rawreads'))
-
- # TODO: better logic for finding the .las file paths, or move this logic out (e.g. take a --fofn option)
- file_list = glob.glob( os.path.join(rawread_dir, 'm*/raw_reads.*.las'))
- io.LOG('file list: %r' % file_list)
-
- # likewise, should this be a parameter?
- db_fn = os.path.join(rawread_dir, 'raw_reads.db')
- n_core = min(n_core, len(file_list))
- exe_pool = Pool(n_core)
- try:
- run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
- io.LOG('finished track_reads')
- except:
- io.LOG('terminating track_reads workers...')
- exe_pool.terminate()
- raise
-
-def track_reads(n_core, base_dir, min_len, bestn, debug, silent, stream):
- if debug:
- n_core = 0
- silent = False
- if silent:
- io.LOG = io.write_nothing
- if stream:
- global Reader
- Reader = io.StreamedProcessReaderContext
-
- try_run_track_reads(n_core, base_dir, min_len, bestn)
-
-def parse_args(argv):
- parser = argparse.ArgumentParser(description='scan the raw-read overlap information to identify the best hits from reads \
-to contigs, using the read_to_contig_map generated by `fc_get_read_hctg_map` in `4-quiver/read_maps/read_to_contig_map`. Assumes 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--n_core', type=int, default=4,
- help='number of processes used for tracking reads; '
- '0 for main process only')
- #parser.add_argument('--fofn', type=str, help='file contains the path of all LAS file to be processed in parallel')
- #parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
- parser.add_argument('--base_dir', type=str, default="./", help='the base working dir of a FALCON assembly')
- parser.add_argument('--min_len', type=int, default=2500, help='min length of the reads')
- parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
- parser.add_argument('--debug', '-g', action='store_true', help='single-threaded, plus other aids to debugging')
- parser.add_argument('--silent', action='store_true', help='suppress cmd reporting on stderr')
- parser.add_argument('--bestn', type=int, default=40, help='keep best n hits')
- args = parser.parse_args(argv[1:])
- return args
-
-def main(argv=sys.argv):
- args = parse_args(argv)
- track_reads(**vars(args))
-
-if __name__ == '__main__':
- main()
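For reference, the core of the deleted rr_hctg_track.py above is a bounded keep-the-best-n aggregation over overlap records. A minimal sketch of that pattern, assuming a simplified (target_id, overlap_len, query_id) record layout rather than the real LA4Falcon output:

from __future__ import print_function
from heapq import heappush, heappushpop

def keep_best_n(hits, bestn):
    """Keep the `bestn` longest overlaps per target read.

    heapq is a min-heap, so once a heap holds `bestn` items,
    heappushpop() evicts the shortest overlap seen so far.
    """
    best = {}
    for t_id, overlap_len, q_id in hits:
        heap = best.setdefault(t_id, [])
        if len(heap) < bestn:
            heappush(heap, (overlap_len, q_id))
        else:
            heappushpop(heap, (overlap_len, q_id))
    return best

# Example: keep the 2 longest overlaps per target; 't1' retains the 700 and 900 hits.
hits = [('t1', 500, 'q1'), ('t1', 900, 'q2'), ('t1', 700, 'q3'), ('t2', 100, 'q4')]
print(keep_best_n(hits, 2))

The same push/pushpop idiom is reused when the per-file results are merged in run_track_reads.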
diff --git a/FALCON_unzip/falcon_unzip/run_quiver.py b/FALCON_unzip/falcon_unzip/run_quiver.py
deleted file mode 100644
index 4eb20e6..0000000
--- a/FALCON_unzip/falcon_unzip/run_quiver.py
+++ /dev/null
@@ -1,394 +0,0 @@
-from falcon_kit import run_support as support
-from pypeflow.simple_pwatcher_bridge import (
- PypeLocalFile, makePypeLocalFile, fn,
- PypeTask,
- PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
-from falcon_kit.FastaReader import FastaReader
-import glob
-import json
-import logging
-import os
-import pprint
-import re
-import sys
-import time
-import ConfigParser
-
-
-LOG = logging.getLogger(__name__)
-
-def system(call, check=False):
- LOG.debug('$(%s)' %repr(call))
- rc = os.system(call)
- msg = 'Call %r returned %d.' % (call, rc)
- if rc:
- LOG.warning(msg)
- if check:
- raise Exception(msg)
- else:
- LOG.debug(msg)
- return rc
-
-def mkdir(d):
- if not os.path.isdir(d):
- os.makedirs(d)
-
-def task_track_reads(self):
- job_done = fn(self.job_done)
- wd = self.parameters['wd']
- config = self.parameters['config']
- input_bam_fofn = config['input_bam_fofn']
- sge_track_reads = config['sge_track_reads']
- script_dir = os.path.join(wd)
- script_fn = os.path.join(script_dir, 'track_reads_h.sh')
-
- script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-cd {wd}
-hostname
-date
-fc_get_read_hctg_map.py --basedir ../..
-fc_rr_hctg_track.py --base_dir ../..
-mkdir -p 4-quiver/reads/
-fc_select_reads_from_bam.py --basedir ../.. {input_bam_fofn}
-date
-touch {job_done}
-""".format(**locals())
-
- with open(script_fn,'w') as script_file:
- script_file.write(script)
- self.generated_script_fn = script_fn
- #job_data['sge_option'] = sge_track_reads
-
-
-def task_run_quiver(self):
-
- ref_fasta = fn(self.ref_fasta)
- read_sam = fn(self.read_sam)
-
- cns_fasta = fn(self.cns_fasta)
- cns_fastq = fn(self.cns_fastq)
- job_done = fn(self.job_done)
-
- job_uid = self.parameters['job_uid']
- wd = self.parameters['wd']
- config = self.parameters['config']
- ctg_id = self.parameters['ctg_id']
-
- smrt_bin = config['smrt_bin']
- sge_quiver = config['sge_quiver']
- job_type = config['job_type']
- samtools = os.path.join(smrt_bin, 'samtools')
- pbalign = os.path.join(smrt_bin, 'pbalign')
- makePbi = os.path.join(smrt_bin, 'makePbi')
- variantCaller = os.path.join(smrt_bin, 'variantCaller')
-
- script_dir = os.path.join(wd)
- script_fn = os.path.join(script_dir , 'cns_%s.sh' % (ctg_id))
-
- script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-hostname
-date
-cd {wd}
-
-{samtools} faidx {ref_fasta}
-{samtools} view -b -S {read_sam} > {ctg_id}.bam
-{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
- --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
- --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
- {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam
-#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam
-({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
- -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
-date
-touch {job_done}
-""".format(**locals())
-
- with open(script_fn,'w') as script_file:
- script_file.write(script)
- self.generated_script_fn = script_fn
- #job_data['sge_option'] = sge_quiver
-
-def rm(f):
- system('rm -f {}'.format(f))
-def touch(f):
- system('touch {}'.format(f))
-def task_cns_zcat(self):
- gathered_p_ctg_fn = fn(self.gathered_p_ctg)
- gathered_h_ctg_fn = fn(self.gathered_h_ctg)
-
- cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
- cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
- cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
- cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
- job_done_fn = fn(self.job_done)
-
- rm(cns_p_ctg_fasta_fn)
- touch(cns_p_ctg_fasta_fn)
- rm(cns_p_ctg_fastq_fn)
- touch(cns_p_ctg_fastq_fn)
- with open(gathered_p_ctg_fn) as ifs:
- for line in ifs:
- cns_fasta_fn, cns_fastq_fn = line.split()
- system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(**locals()))
- system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(**locals()))
- with open(gathered_p_ctg_fn) as ifs:
- for line in ifs:
- cns_fasta_fn, cns_fastq_fn = line.split()
- rm(cns_fasta_fn)
- rm(cns_fastq_fn)
- rm(cns_h_ctg_fasta_fn)
- touch(cns_h_ctg_fasta_fn)
- rm(cns_h_ctg_fastq_fn)
- touch(cns_h_ctg_fastq_fn)
- with open(gathered_h_ctg_fn) as ifs:
- for line in ifs:
- cns_fasta_fn, cns_fastq_fn = line.split()
- system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(**locals()))
- system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(**locals()))
- with open(gathered_h_ctg_fn) as ifs:
- for line in ifs:
- cns_fasta_fn, cns_fastq_fn = line.split()
- rm(cns_fasta_fn)
- rm(cns_fastq_fn)
-
- touch(job_done_fn)
-
-def task_scatter_quiver(self):
- p_ctg_fn = fn(self.p_ctg_fa)
- h_ctg_fn = fn(self.h_ctg_fa)
- out_json = fn(self.scattered_quiver_json)
-
- ref_seq_data = {}
-
- # I think this will crash if the file is empty. Maybe that is ok.
- p_ctg_fa = FastaReader(p_ctg_fn)
- ctg_types = {}
- for r in p_ctg_fa:
- rid = r.name.split()[0]
- ref_seq_data[rid] = r.sequence
- ctg_types[rid] = 'p'
-
-
- # I think this will crash if the file is empty. Maybe that is ok.
- h_ctg_fa = FastaReader(h_ctg_fn)
- for r in h_ctg_fa:
- rid = r.name.split()[0]
- ref_seq_data[rid] = r.sequence
- ctg_types[rid] = 'h'
-
- ctg_ids = sorted(ref_seq_data.keys())
- #p_ctg_out=[]
- #h_ctg_out=[]
- #job_done_plfs = {}
- jobs = []
- for ctg_id in ctg_ids:
- sequence = ref_seq_data[ctg_id]
- m_ctg_id = ctg_id.split('-')[0]
- wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
- ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
- read_sam = os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id))
- #cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
- #cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
- #job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
-
- if os.path.exists(read_sam): # TODO(CD): Ask Jason what we should do if missing SAM. And what about network latency?
- #if ctg_types[ctg_id] == 'p':
- # p_ctg_out.append( (cns_fasta, cns_fastq) )
- #if ctg_types[ctg_id] == 'h':
- # h_ctg_out.append( (cns_fasta, cns_fastq) )
- mkdir(wd)
- if not os.path.exists(fn(ref_fasta)):
- with open(fn(ref_fasta),'w') as f:
- print >>f, '>'+ctg_id
- print >>f, sequence
- #parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
- #make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam},
- # outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
- # parameters = parameters,
- # )
- #quiver_task = make_quiver_task(task_run_quiver)
- #wf.addTask(quiver_task)
- #job_done_plfs['{}'.format(ctg_id)] = job_done
- new_job = {}
- new_job['ctg_id'] = ctg_id
- jobs.append(new_job)
- open(out_json, 'w').write(json.dumps(jobs))
-
-def create_quiver_jobs(scattered_quiver_plf):
- scattered_quiver_fn = fn(scattered_quiver_plf)
- jobs = json.loads(open(scattered_quiver_fn).read())
- #ctg_ids = sorted(jobs['ref_seq_data'])
- p_ctg_out=[]
- h_ctg_out=[]
- job_done_plfs = {}
- for job in jobs:
- ctg_id = job['ctg_id']
- m_ctg_id = ctg_id.split('-')[0]
- wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
- ref_fasta = makePypeLocalFile(os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id)))
- read_sam = makePypeLocalFile(os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id)))
- cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
- cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
- job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
-
- if os.path.exists(fn(read_sam)): # TODO(CD): Ask Jason what we should do if missing SAM.
- if ctg_types[ctg_id] == 'p':
- p_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
- elif ctg_types[ctg_id] == 'h':
- h_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
- else:
- LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(ctg_types[ctg_id]))
- parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
- make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam,
- 'scattered_quiver': scattered_quiver_plf,
- },
- outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
- parameters = parameters,
- )
- quiver_task = make_quiver_task(task_run_quiver)
- wf.addTask(quiver_task)
- job_done_plfs['{}'.format(ctg_id)] = job_done
- #sge_quiver = config['sge_quiver']
- return p_ctg_out, h_ctg_out, job_done_plfs
-
-def task_gather_quiver(self):
- """We wrote the "gathered" files during task construction.
- """
- job_done_fn = fn(self.job_done)
- touch(job_done_fn)
-
-
-def main(argv=sys.argv):
- global LOG
- LOG = support.setup_logger(None)
-
-
- if len(sys.argv) < 2:
- print>>sys.stderr, 'you need to provide a configuration file to specify the cluster runtime environment'
- sys.exit(1)
-
- config_fn = sys.argv[1]
- config_absbasedir = os.path.dirname(os.path.abspath(config_fn))
-
- config = ConfigParser.ConfigParser()
- config.read(config_fn)
-
-
- job_type = 'SGE'
- if config.has_option('General', 'job_type'):
- job_type = config.get('General', 'job_type')
-
- sge_track_reads = ' -pe smp 12 -q bigmem'
- if config.has_option('Unzip', 'sge_track_reads'):
- sge_track_reads = config.get('Unzip', 'sge_track_reads')
-
- sge_quiver = ' -pe smp 24 -q bigmem '
- if config.has_option('Unzip', 'sge_quiver'):
- sge_quiver = config.get('Unzip', 'sge_quiver')
-
- smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
- if config.has_option('Unzip', 'smrt_bin'):
- smrt_bin = config.get('Unzip', 'smrt_bin')
-
- input_bam_fofn = 'input_bam.fofn'
- if config.has_option('Unzip', 'input_bam_fofn'):
- input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
- if not os.path.isabs(input_bam_fofn):
- input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)
-
-
- quiver_concurrent_jobs = 8
- if config.has_option('Unzip', 'quiver_concurrent_jobs'):
- quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')
-
- config = {'job_type': job_type,
- 'sge_quiver': sge_quiver,
- 'sge_track_reads': sge_track_reads,
- 'input_bam_fofn': input_bam_fofn,
- 'smrt_bin': smrt_bin}
- LOG.info('config={}'.format(pprint.pformat(config)))
-
- #support.job_type = 'SGE' #tmp hack until we have a configuration parser
-
-
- wf = PypeProcWatcherWorkflow(
- max_jobs=quiver_concurrent_jobs,
- )
-
- abscwd = os.path.abspath('.')
- parameters = {'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'), 'config': config}
- hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention
- track_reads_h_done_plf = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_h_done'))
- make_track_reads_task = PypeTask(inputs = {'hasm_done': hasm_done_plf},
- outputs = {'job_done': track_reads_h_done_plf},
- parameters = parameters,
- )
- track_reads_task = make_track_reads_task(task_track_reads)
- #sge_track_reads = config['sge_track_reads']
-
- wf.addTask(track_reads_task)
-
- scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json')
- make_task = PypeTask(
- inputs = {
- 'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
- 'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
- 'track_reads_h_done': track_reads_h_done_plf,
- },
- outputs = {
- 'scattered_quiver_json': scattered_quiver_plf,
- },
- parameters = {},
- )
- wf.addTask(make_task(task_scatter_quiver))
- wf.refreshTargets()
-
- p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(scattered_quiver_plf)
-
- gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
- gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
- gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
- mkdir('4-quiver/cns_gather')
- with open(fn(gathered_p_ctg_plf), 'w') as ifs:
- for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
- ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
- with open(fn(gathered_h_ctg_plf), 'w') as ifs:
- for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
- ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
-
- make_task = PypeTask(
- inputs = job_done_plfs,
- outputs = {
- 'job_done': gather_done_plf,
- },
- parameters = {},
- )
- wf.addTask(make_task(task_gather_quiver))
- wf.refreshTargets()
-
- cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta')
- cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq')
- cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta')
- cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq')
- zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
- make_task = PypeTask(
- inputs = {
- 'gathered_p_ctg': gathered_p_ctg_plf,
- 'gathered_h_ctg': gathered_h_ctg_plf,
- 'gather_done': gather_done_plf,
- },
- outputs = {
- 'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
- 'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
- 'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
- 'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
- 'job_done': zcat_done_plf,
- },
- )
- wf.addTask(make_task(task_cns_zcat))
-
- wf.refreshTargets()
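The deleted run_quiver.py scatters consensus work by writing a JSON list of per-contig job records (task_scatter_quiver) and reads the list back to create one task per record (create_quiver_jobs). A minimal sketch of that scatter/gather handshake; the filename and contig ids are only examples:

import json

def scatter(ctg_ids, out_json_fn):
    # One record per contig whose inputs are in place.
    jobs = [{'ctg_id': ctg_id} for ctg_id in sorted(ctg_ids)]
    with open(out_json_fn, 'w') as out:
        out.write(json.dumps(jobs))

def gather(scattered_json_fn):
    # Read the records back and yield the per-contig work items.
    with open(scattered_json_fn) as f:
        for job in json.loads(f.read()):
            yield job['ctg_id']

scatter(['000000F', '000000F-001-01'], 'scattered.json')
for ctg_id in gather('scattered.json'):
    print(ctg_id)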
diff --git a/FALCON_unzip/falcon_unzip/select_reads_from_bam.py b/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
deleted file mode 100644
index 03cbde1..0000000
--- a/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import pysam
-
-import argparse
-import glob
-import os
-import sys
-
-def select_reads_from_bam(input_bam_fofn_fn, asm_dir, hasm_dir, quiver_dir):
- read_partition = {}
- read_to_ctgs = {}
-
- rawread_to_contigs_fn = os.path.join(quiver_dir, 'read_maps', 'rawread_to_contigs')
- rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
- rid_to_oid = open(rawread_ids_fn).read().split('\n')
- with open(rawread_to_contigs_fn) as f:
- for row in f:
- row = row.strip().split()
- if int(row[3]) >= 1: #keep only the top hit
- continue
- ctg_id = row[1]
- if ctg_id == 'NA':
- continue
- read_partition.setdefault( ctg_id, set())
- r_id = row[0]
- o_id = rid_to_oid[ int(r_id) ]
- read_partition[ ctg_id ].add(o_id)
- read_to_ctgs.setdefault(o_id, [])
- read_to_ctgs[ o_id ].append( (int(row[4]) ,ctg_id) )
- print "num read_partitions:", len(read_partition)
- print "num read_to_ctgs:", len(read_to_ctgs)
-
- header = None
- fofn_basedir = os.path.normpath(os.path.dirname(input_bam_fofn_fn))
- def abs_fn(maybe_rel_fn):
- if os.path.isabs(maybe_rel_fn):
- return maybe_rel_fn
- else:
- return os.path.join(fofn_basedir, maybe_rel_fn)
- for row in open(input_bam_fofn_fn):
- fn = abs_fn(row.strip())
- samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
- if header == None:
- header = samfile.header
- else:
- header['RG'].extend(samfile.header['RG'])
- samfile.close()
-
- PG = header.pop('PG') #remove the PG lines; a bug can make them contain unreadable characters
- #print PG
-
- #base_dir = os.getcwd()
- #outfile = pysam.AlignmentFile( os.path.join(base_dir, 'header.sam' ), 'wh', header=header )
- #outfile.close()
-
- ctgs = read_partition.keys()
- ctgs.sort()
- selected_ctgs = set()
- for ctg in ctgs:
- picked_reads = read_partition[ ctg ]
- print "ctg, len:", ctg, len(picked_reads) # TODO: Is this for debugging?
- if len(picked_reads) > 20:
- selected_ctgs.add(ctg)
-
- outfile = {}
-
- sam_dir = os.path.join(quiver_dir, 'reads')
- mkdir(sam_dir)
- for row in open(input_bam_fofn_fn):
- fn = abs_fn(row.strip())
- samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
- for r in samfile.fetch( until_eof = True ):
- if r.query_name not in read_to_ctgs:
- #print "Missing:", r.query_name
- continue
- ctg_list = read_to_ctgs[ r.query_name ]
- ctg_list.sort()
- score, ctg = ctg_list[0]
- if ctg not in selected_ctgs:
- #print "Not selected:", ctg
- continue
- if ctg not in outfile:
- samfile_fn = os.path.join(quiver_dir, 'reads', '%s.sam' % ctg)
- print >>sys.stderr, 'samfile_fn:{!r}'.format(samfile_fn)
- outfile[ctg] = pysam.AlignmentFile(samfile_fn, 'wh', header=header)
- outfile[ctg].write(r)
- samfile.close()
-
- for ctg in outfile:
- outfile[ctg].close()
-
-def mkdir(d):
- if not os.path.isdir(d):
- os.makedirs(d)
-
-def parse_args(argv):
- parser = argparse.ArgumentParser(description='Assume 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--basedir', type=str, default='./', help='the base working dir of a FALCON assembly')
- parser.add_argument('input_bam_fofn', type=str, help='File of BAM filenames. Paths are relative to dir of FOFN, not CWD.')
- args = parser.parse_args(argv[1:])
- return args
-
-def main(argv=sys.argv):
- args = parse_args(argv)
- basedir = args.basedir
- input_bam_fofn = args.input_bam_fofn
- #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
- #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
- asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
- hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
- quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
-
- select_reads_from_bam(input_bam_fofn, asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
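The routing step in the deleted select_reads_from_bam.py streams each BAM from the FOFN once and appends every read to the SAM file of its best-scoring contig. A compact sketch of that step, assuming read_to_ctgs maps query names to lists of (score, ctg) pairs and that a merged header is already available:

import os
import pysam

def route_reads(input_bam_fofn_fn, read_to_ctgs, sam_dir, header):
    fofn_basedir = os.path.normpath(os.path.dirname(input_bam_fofn_fn))
    def abs_fn(maybe_rel_fn):
        # Paths in the FOFN are relative to the FOFN's directory, not the CWD.
        if os.path.isabs(maybe_rel_fn):
            return maybe_rel_fn
        return os.path.join(fofn_basedir, maybe_rel_fn)

    outfiles = {}
    for row in open(input_bam_fofn_fn):
        samfile = pysam.AlignmentFile(abs_fn(row.strip()), 'rb', check_sq=False)
        for r in samfile.fetch(until_eof=True):
            if r.query_name not in read_to_ctgs:
                continue
            # The lowest (score, ctg) pair wins; scores are negated overlap totals.
            score, ctg = sorted(read_to_ctgs[r.query_name])[0]
            if ctg not in outfiles:
                outfiles[ctg] = pysam.AlignmentFile(
                    os.path.join(sam_dir, '%s.sam' % ctg), 'wh', header=header)
            outfiles[ctg].write(r)
        samfile.close()
    for out in outfiles.values():
        out.close()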
diff --git a/FALCON_unzip/falcon_unzip/unzip.py b/FALCON_unzip/falcon_unzip/unzip.py
deleted file mode 100644
index e585fbc..0000000
--- a/FALCON_unzip/falcon_unzip/unzip.py
+++ /dev/null
@@ -1,357 +0,0 @@
-from falcon_kit import run_support as support
-from pypeflow.simple_pwatcher_bridge import (
- PypeLocalFile, makePypeLocalFile, fn,
- PypeTask,
- PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
-from falcon_kit.FastaReader import FastaReader
-import glob
-import logging
-import os
-import re
-import sys
-import time
-import ConfigParser
-
-LOG = logging.getLogger(__name__)
-
-def system(call, check=False):
- LOG.debug('$(%s)' %repr(call))
- rc = os.system(call)
- msg = 'Call %r returned %d.' % (call, rc)
- if rc:
- LOG.warning(msg)
- if check:
- raise Exception(msg)
- else:
- LOG.debug(msg)
- return rc
-
-def mkdir(d):
- if not os.path.isdir(d):
- os.makedirs(d)
-
-def task_track_reads(self):
- job_done = fn(self.job_done)
- wd = self.parameters['wd']
- config = self.parameters['config']
- sge_track_reads = config['sge_track_reads']
- script_dir = os.path.join(wd)
- script_fn = os.path.join(script_dir , 'track_reads.sh')
- topdir = '../..'
-
- script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-hostname
-date
-cd {topdir}
-python -m falcon_kit.mains.get_read_ctg_map
-python -m falcon_kit.mains.rr_ctg_track
-python -m falcon_kit.mains.pr_ctg_track
-#mkdir -p 3-unzip/reads/
-python -m falcon_kit.mains.fetch_reads
-cd {wd}
-date
-touch {job_done}
-""".format(**locals())
-
- with open(script_fn,'w') as script_file:
- script_file.write(script)
- self.generated_script_fn = script_fn
- #job_data['sge_option'] = sge_track_reads
-
-
-def task_run_blasr(self):
- job_done = fn(self.job_done)
- ref_fasta = fn(self.ref_fasta)
- read_fasta = fn(self.read_fasta)
-
- job_uid = self.parameters['job_uid']
- wd = self.parameters['wd']
- ctg_id = self.parameters['ctg_id']
-
- config = self.parameters['config']
- smrt_bin = config['smrt_bin']
- sge_blasr_aln = config['sge_blasr_aln']
- job_type = config['job_type']
- blasr = os.path.join(smrt_bin, 'blasr')
- samtools = os.path.join(smrt_bin, 'samtools')
-
-
- script_dir = os.path.join(wd)
- script_fn = os.path.join(script_dir , 'aln_{ctg_id}.sh'.format(ctg_id = ctg_id))
-
- script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-cd {wd}
-hostname
-date
-cd {wd}
-time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
- -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
- -minMatch 12 -nproc 24 -sam -out tmp_aln.sam
-{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
-{samtools} index {ctg_id}_sorted.bam
-rm tmp_aln.sam
-date
-touch {job_done}
-""".format(**locals())
-
- with open(script_fn,'w') as script_file:
- script_file.write(script)
- self.generated_script_fn = script_fn
- #job_data['sge_option'] = sge_blasr_aln
-
-
-def task_phasing(self):
- ref_fasta = fn(self.ref_fasta)
- aln_bam = fn(self.aln_bam)
-
- job_done = fn(self.job_done)
-
- job_uid = self.parameters['job_uid']
- wd = self.parameters['wd']
- ctg_id = self.parameters['ctg_id']
-
- config = self.parameters['config']
- sge_phasing = config['sge_phasing']
- job_type = config['job_type']
-
- script_dir = os.path.join(wd)
- script_fn = os.path.join(script_dir , 'p_%s.sh' % (ctg_id))
-
- script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-cd {wd}
-hostname
-date
-cd {wd}
-fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir ..
-fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads
-date
-touch {job_done}
-""".format(**locals())
-
- with open(script_fn,'w') as script_file:
- script_file.write(script)
- self.generated_script_fn = script_fn
- #job_data['sge_option'] = sge_phasing
-
-
-def task_hasm(self):
- rid_to_phase_all = fn(self.rid_to_phase_all)
- job_done = fn(self.job_done)
- config = self.parameters['config']
- sge_hasm = config['sge_hasm']
-
- wd = self.parameters['wd']
-
- job_type = config['job_type']
-
- script_dir = os.path.join(wd)
- script_fn = os.path.join(script_dir , 'hasm.sh')
-
- las_fofn = '../../2-asm-falcon/las.fofn'
- las_fofn = '../../1-preads_ovl/merge-gather/las.fofn'
- script = """\
-set -vex
-trap 'touch {job_done}.exit' EXIT
-hostname
-date
-cd {wd}
-
-fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
-fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
-if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
-then
- ln -sf ../../1-preads_ovl/preads4falcon.fasta .
-else
- ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
-fi
-fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta
-
-# the rest of this script is a bit hacky; it should be improved
-
-WD=$PWD
-for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done
-
-## prepare for quivering the haplotigs
-cd $WD/..
-if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
-if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
-if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
-if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
-if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi
-
-find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
-find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
-find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
-find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
-find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
-find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
-cd ../
-date
-touch {job_done}
-""".format(**locals())
-
- with open(script_fn,'w') as script_file:
- script_file.write(script)
- self.generated_script_fn = script_fn
- #job_data['sge_option'] = sge_hasm
-
-def unzip_all(config):
- unzip_concurrent_jobs = config['unzip_concurrent_jobs']
- wf = PypeProcWatcherWorkflow(
- max_jobs=unzip_concurrent_jobs,
- )
- wf.max_jobs = unzip_concurrent_jobs
-
- ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
- falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
- wdir = os.path.abspath('./3-unzip/reads')
- parameters = {'wd': wdir, 'config': config}
-
- job_done = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_done'))
- make_track_reads_task = PypeTask(inputs = {'falcon_asm_done': falcon_asm_done},
- outputs = {'job_done': job_done, 'ctg_list_file': ctg_list_file},
- parameters = parameters,
- wdir = wdir,
- )
- track_reads_task = make_track_reads_task(task_track_reads)
-
- wf.addTask(track_reads_task)
- wf.refreshTargets() #force refresh now, will put proper dependence later
-
- ctg_ids = []
- with open('./3-unzip/reads/ctg_list') as f:
- for row in f:
- row = row.strip()
- ctg_ids.append(row)
-
- aln1_outs = {}
-
- all_ctg_out = {}
-
- for ctg_id in ctg_ids:
- # inputs
- ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
- read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))
-
- # outputs
- wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))
- #mkdir(wd)
- blasr_dir = os.path.join(wd, 'blasr')
- ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))
- job_done = makePypeLocalFile(os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id = ctg_id)))
-
- parameters = {'job_uid':'aln-'+ctg_id, 'wd': blasr_dir, 'config':config, 'ctg_id': ctg_id}
- make_blasr_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_fasta': read_fasta},
- outputs = {'ctg_aln_out': ctg_aln_out, 'job_done': job_done},
- parameters = parameters,
- )
- blasr_task = make_blasr_task(task_run_blasr)
- aln1_outs[ctg_id] = (ctg_aln_out, job_done)
- wf.addTask(blasr_task)
-
- phasing_dir = os.path.join(wd, 'phasing')
- job_done = makePypeLocalFile(os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id = ctg_id)))
- rid_to_phase_out = makePypeLocalFile(os.path.join(wd, 'rid_to_phase.{ctg_id}'.format(ctg_id = ctg_id))) # TODO: ???
- all_ctg_out[ 'r2p.{ctg_id}'.format(ctg_id = ctg_id) ] = rid_to_phase_out # implicit output?
-
- parameters = {'job_uid':'ha-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id}
- make_phasing_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'aln_bam':ctg_aln_out},
- outputs = {'job_done': job_done},
- parameters = parameters,
- )
- phasing_task = make_phasing_task(task_phasing)
- wf.addTask(phasing_task)
-
- wf.refreshTargets()
-
- hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
- #mkdir(hasm_wd)
- rid_to_phase_all = makePypeLocalFile(os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
- task = PypeTask(inputs = all_ctg_out, outputs = {'rid_to_phase_all': rid_to_phase_all},
- ) (get_rid_to_phase_all)
- wf.addTask(task)
-
- parameters['wd'] = hasm_wd
- job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
- make_hasm_task = PypeTask(inputs = {'rid_to_phase_all': rid_to_phase_all},
- outputs = {'job_done': job_done},
- parameters = parameters,
- )
- hasm_task = make_hasm_task(task_hasm)
-
- wf.addTask(hasm_task)
-
- wf.refreshTargets()
-
-def get_rid_to_phase_all(self):
- # Tasks must be at module scope now.
- rid_to_phase_all_fn = fn(self.rid_to_phase_all)
- inputs_fn = [ fn(f) for f in self.inputs.values() ]
- inputs_fn.sort()
- output = []
- for fname in inputs_fn:
- output.extend(open(fname).read())
-
- out = open(rid_to_phase_all_fn, 'w')
- out.write(''.join(output))
- out.close()
-
-def main(argv=sys.argv):
- global LOG
- LOG = support.setup_logger(None)
-
-
- if len(argv) < 2 or argv[1].startswith('-'):
- print 'you need to provide a configuration file to specify the cluster runtime environment'
- sys.exit(1)
-
- config_fn = argv[1]
-
- config = ConfigParser.ConfigParser()
- config.read(config_fn)
-
- job_type = 'SGE'
- if config.has_option('General', 'job_type'):
- job_type = config.get('General', 'job_type')
-
- sge_blasr_aln = ' -pe smp 24 -q bigmem '
- if config.has_option('Unzip', 'sge_blasr_aln'):
- sge_blasr_aln = config.get('Unzip', 'sge_blasr_aln')
-
- smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
- if config.has_option('Unzip', 'smrt_bin'):
- smrt_bin = config.get('Unzip', 'smrt_bin')
-
- sge_phasing = ' -pe smp 12 -q bigmem'
- if config.has_option('Unzip', 'sge_phasing'):
- sge_phasing = config.get('Unzip', 'sge_phasing')
-
- sge_hasm = ' -pe smp 48 -q bigmem'
- if config.has_option('Unzip', 'sge_hasm'):
- sge_hasm = config.get('Unzip', 'sge_hasm')
-
- sge_track_reads = ' -pe smp 12 -q bigmem'
- if config.has_option('Unzip', 'sge_track_reads'):
- sge_track_reads = config.get('Unzip', 'sge_track_reads')
-
- unzip_concurrent_jobs = 8
- if config.has_option('Unzip', 'unzip_concurrent_jobs'):
- unzip_concurrent_jobs = config.getint('Unzip', 'unzip_concurrent_jobs')
-
- config = {'job_type': job_type,
- 'sge_blasr_aln': sge_blasr_aln,
- 'smrt_bin': smrt_bin,
- 'sge_phasing': sge_phasing,
- 'sge_hasm': sge_hasm,
- 'sge_track_reads': sge_track_reads,
- 'unzip_concurrent_jobs':unzip_concurrent_jobs}
-
- #support.job_type = 'SGE' #tmp hack until we have a configuration parser
-
- unzip_all(config)
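Both deleted drivers (run_quiver.py above and unzip.py here) read their [General]/[Unzip] settings with the same use-a-default-unless-the-option-is-present pattern. A small helper capturing that pattern; the config filename is only an example:

import ConfigParser  # 'configparser' on Python 3

def get_option(config, section, option, default):
    # Return the configured value if present, otherwise the caller's default.
    if config.has_option(section, option):
        return config.get(section, option)
    return default

config = ConfigParser.ConfigParser()
config.read('fc_unzip.cfg')
job_type = get_option(config, 'General', 'job_type', 'SGE')
smrt_bin = get_option(config, 'Unzip', 'smrt_bin', '/usr/bin')
sge_phasing = get_option(config, 'Unzip', 'sge_phasing', ' -pe smp 12 -q bigmem')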
diff --git a/FALCON_unzip/setup.py b/FALCON_unzip/setup.py
deleted file mode 100755
index b54466c..0000000
--- a/FALCON_unzip/setup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-from setuptools import setup
-
-from distutils.core import Extension
-
-import glob
-
-install_requires=[ "networkx >= 1.7", "pysam >= 0.8.4" ]
-
-scripts = glob.glob("src/py_scripts/*.py")
-
-setup(name='falcon_unzip',
- version='0.1.0',
- description='Falcon unzip',
- author='Jason Chin',
- author_email='jchin@pacificbiosciences.com',
- packages=['falcon_unzip'],
- package_dir={'falcon_unzip':'falcon_unzip/'},
- scripts = scripts,
- zip_safe = False,
- install_requires=install_requires
- )
-
diff --git a/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
deleted file mode 100644
index 71b60f5..0000000
--- a/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.dedup_h_tigs import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py b/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
deleted file mode 100644
index 09af64b..0000000
--- a/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.get_read_hctg_map import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
deleted file mode 100644
index e9fcb1f..0000000
--- a/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.graphs_to_h_tigs import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py b/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
deleted file mode 100644
index c3a38d2..0000000
--- a/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from falcon_unzip.ovlp_filter_with_phase import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
-
-
diff --git a/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py b/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
deleted file mode 100644
index 2c58c5a..0000000
--- a/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from falcon_unzip.phased_ovlp_to_graph import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
-
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing.py b/FALCON_unzip/src/py_scripts/fc_phasing.py
deleted file mode 100644
index 70c4711..0000000
--- a/FALCON_unzip/src/py_scripts/fc_phasing.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from falcon_unzip.phasing import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
-
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py b/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
deleted file mode 100644
index 3fa3592..0000000
--- a/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.phasing_readmap import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_quiver.py b/FALCON_unzip/src/py_scripts/fc_quiver.py
deleted file mode 100644
index 21a5af5..0000000
--- a/FALCON_unzip/src/py_scripts/fc_quiver.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.run_quiver import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py b/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
deleted file mode 100644
index b68b43a..0000000
--- a/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.rr_hctg_track import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py b/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
deleted file mode 100644
index 0e85055..0000000
--- a/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.select_reads_from_bam import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py b/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
deleted file mode 100644
index f84dc30..0000000
--- a/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
+++ /dev/null
@@ -1,398 +0,0 @@
-#from pypeflow.common import *
-#from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
-#from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
-#from pypeflow.controller import PypeWorkflow, PypeMPWorkflow, PypeThreadWorkflow
-from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
- makePypeLocalFile, fn, PypeTask)
-PypeThreadTaskBase = MyFakePypeThreadTaskBase
-from falcon_kit.FastaReader import FastaReader
-from falcon_kit.fc_asm_graph import AsmGraph
-import glob
-import sys
-import subprocess as sp
-import shlex
-import os
-
-def make_dirs(d):
- if not os.path.isdir(d):
- os.makedirs(d)
-
-rawread_dir = os.path.abspath( "./0-rawreads" )
-pread_dir = os.path.abspath( "./1-preads_ovl" )
-asm_dir = os.path.abspath( os.path.join("./3-unzip/") )
-
-read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
-make_dirs(read_map_dir)
-
-wf = PypeProcWatcherWorkflow(
- max_jobs=12,
-)
-
-rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
-rawread_id_file = makePypeLocalFile( os.path.join( rawread_dir, "raw_read_ids" ) )
-
- at PypeTask( inputs = {"rawread_db": rawread_db},
- outputs = {"rawread_id_file": rawread_id_file},
- TaskType = PypeThreadTaskBase,
- URL = "task://localhost/dump_rawread_ids" )
-def dump_rawread_ids(self):
- rawread_db = fn( self.rawread_db )
- rawread_id_file = fn( self.rawread_id_file )
- os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )
-
-wf.addTask( dump_rawread_ids )
-
-pread_db = makePypeLocalFile( os.path.join( pread_dir, "preads.db" ) )
-pread_id_file = makePypeLocalFile( os.path.join( pread_dir, "pread_ids" ) )
-
- at PypeTask( inputs = {"pread_db": pread_db},
- outputs = {"pread_id_file": pread_id_file},
- TaskType = PypeThreadTaskBase,
- URL = "task://localhost/dump_pread_ids" )
-def dump_pread_ids(self):
- pread_db = fn( self.pread_db )
- pread_id_file = fn( self.pread_id_file )
- os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file) )
-
-wf.addTask( dump_pread_ids )
-wf.refreshTargets() # block
-
-all_raw_las_files = {}
-for las_fn in glob.glob( os.path.join( rawread_dir, "m*/raw_reads.*.las") ):
- idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
- idx = int(idx.split(".")[1])
- las_file = makePypeLocalFile( las_fn )
- all_raw_las_files["r_las_%s" % idx] = las_file
-
-all_pread_las_files = {}
-for las_fn in glob.glob( os.path.join( pread_dir, "m*/preads.*.las") ):
- idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
- idx = int(idx.split(".")[1])
- las_file = makePypeLocalFile( las_fn )
- all_pread_las_files["p_las_%s" % idx] = las_file
-
-
-
-h_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_edges") )
-p_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_p_ctg_edges") )
-h_ctg_ids = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_ids") )
-
-inputs = { "rawread_id_file": rawread_id_file,
- "pread_id_file": pread_id_file,
- "h_ctg_edges": h_ctg_edges,
- "p_ctg_edges": p_ctg_edges,
- "h_ctg_ids": h_ctg_ids}
-
-read_to_contig_map = makePypeLocalFile( os.path.join(read_map_dir, "read_to_contig_map") )
-
-@PypeTask( inputs = inputs,
- outputs = {"read_to_contig_map": read_to_contig_map},
- TaskType = PypeThreadTaskBase,
- URL = "task://localhost/get_ctg_read_map" )
-
-def generate_read_to_ctg_map(self):
- rawread_id_file = fn( self.rawread_id_file )
- pread_id_file = fn( self.pread_id_file )
- read_to_contig_map = fn( self.read_to_contig_map )
-
- pread_did_to_rid = open(pread_id_file).read().split("\n")
- rid_to_oid = open(rawread_id_file).read().split("\n")
-
- h_ctg_edges = fn( self.h_ctg_edges )
- p_ctg_edges = fn( self.p_ctg_edges )
-
- h_ctg_ids = set()
- with open(fn(self.h_ctg_ids)) as f:
- for row in f:
- row = row.strip()
- h_ctg_ids.add( row )
-
- pread_to_contigs = {}
-
- for fname in ( p_ctg_edges, h_ctg_edges):
- with open(fname) as f:
- for row in f:
- row = row.strip().split()
- ctg = row[0]
- if len(ctg.split("_")) > 1 and ctg not in h_ctg_ids:
- continue
- n1 = row[1]
- n2 = row[2]
- pid1 = int(n1.split(":")[0])
- pid2 = int(n2.split(":")[0])
- rid1 = pread_did_to_rid[pid1].split("/")[1]
- rid2 = pread_did_to_rid[pid2].split("/")[1]
- rid1 = int(int(rid1)/10)
- rid2 = int(int(rid2)/10)
- oid1 = rid_to_oid[rid1]
- oid2 = rid_to_oid[rid2]
- k1 = (pid1, rid1, oid1)
- pread_to_contigs.setdefault( k1, set() )
- pread_to_contigs[ k1 ].add( ctg )
- k2 = (pid2, rid2, oid2)
- pread_to_contigs.setdefault( k2, set() )
- pread_to_contigs[ k2 ].add( ctg )
-
- with open(read_to_contig_map, "w") as f:
- for k in pread_to_contigs:
- pid, rid, oid = k
- for ctg in list(pread_to_contigs[ k ]):
- print >>f, "%09d %09d %s %s" % (pid, rid, oid, ctg)
-
-wf.addTask( generate_read_to_ctg_map )
-
-def dump_rawread_to_ctg(self):
- rawread_db = fn(self.rawread_db)
- rawread_id_file = fn(self.rawread_id_file)
- phased_read_file = fn(self.phased_reads)
- las_file = fn(self.las_file)
- rawread_to_contig_file = fn(self.rawread_to_contig_file)
- read_to_contig_map = fn(self.read_to_contig_map)
- rid_to_oid = open(rawread_id_file).read().split('\n')
-
-
- ovlp_data = []
- ovlp_count = 0
- longest_ovlp = 0
- a_id = None
- rid_to_contigs = {}
-
- with open(read_to_contig_map) as f:
- for row in f:
- row = row.strip().split()
- pid, rid, oid, ctg = row
- rid = int(rid)
- rid_to_contigs.setdefault( rid, (oid, set() ) )
- rid_to_contigs[ rid ][1].add( ctg )
-
- oid_to_phase = {}
- with open(phased_read_file) as f:
- for row in f:
- row = row.strip().split()
- ctg_id, block, phase = row[1:4]
- oid = row[6]
- block = int(block)
- phase = int(phase)
- oid_to_phase[ oid ] = (ctg_id, block, phase)
-
-
- with open(rawread_to_contig_file, "w") as f:
- ovlp_data = {}
- cur_read_id = None
- skip_rest = 0
- for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file)) ).splitlines():
-
- row = row.strip().split()
- t_id = int(row[1])
- q_id = int(row[0])
- if q_id != cur_read_id:
- if cur_read_id == None:
- cur_read_id = q_id
- else:
- if len(ovlp_data) == 0:
- o_id = rid_to_oid[ cur_read_id ]
- print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
- if len(ovlp_data) != 0:
- ovlp_v = ovlp_data.values()
- ovlp_v.sort()
- rank = 0
- for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
- print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
- rank += 1
- ovlp_data = {}
- cur_read_id = q_id
- skip_rest = 0
-
- if q_id in rid_to_contigs and len(ovlp_data) == 0: #if the query is already an edge of some contig....
- t_o_id, ctgs = rid_to_contigs[ q_id ]
- o_id = rid_to_oid[ q_id ]
- for ctg in list(ctgs):
- ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
- ovlp_data[ctg][0] = -int(row[7])
- ovlp_data[ctg][1] += 1
- skip_rest = 1
-
- if skip_rest == 1:
- continue
-
- if t_id not in rid_to_contigs:
- continue
-
- q_phase = oid_to_phase.get( rid_to_oid[q_id], None )
- if q_phase != None:
- ctg_id, block, phase = q_phase
- if block != -1:
- t_phase = oid_to_phase.get( rid_to_oid[t_id], None )
- if t_phase != None:
- if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
- continue
-
- t_o_id, ctgs = rid_to_contigs[ t_id ]
- o_id = rid_to_oid[ q_id ]
-
- for ctg in list(ctgs):
- ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
- ovlp_data[ctg][0] += int(row[2])
- ovlp_data[ctg][1] += 1
-
- if len(ovlp_data) != 0:
- ovlp_v = ovlp_data.values()
- ovlp_v.sort()
- rank = 0
- for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
- print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
- rank += 1
-
-def dump_pread_to_ctg(self):
- pread_db = fn( self.pread_db )
- rawread_id_file = fn( self.rawread_id_file )
- pread_id_file = fn( self.pread_id_file )
- phased_read_file = fn( self.phased_reads)
- read_to_contig_map = fn( self.read_to_contig_map )
- las_file = fn( self.las_file )
- pread_to_contig_file = fn( self.pread_to_contig_file )
- read_to_contig_map = fn( self.read_to_contig_map )
-
- pid_to_rid = open(pread_id_file).read().split("\n")
- rid_to_oid = open(rawread_id_file).read().split("\n")
-
-
- ovlp_data = []
- ovlp_count = 0
- longest_ovlp = 0
- a_id = None
- pid_to_contigs = {}
-
- with open(read_to_contig_map) as f:
- for row in f:
- row = row.strip().split()
- pid, rid, oid, ctg = row
- pid = int(pid)
- pid_to_contigs.setdefault( pid, (oid, set() ) )
- pid_to_contigs[ pid ][1].add( ctg )
-
- oid_to_phase = {}
- with open(phased_read_file) as f:
- for row in f:
- row = row.strip().split()
- ctg_id, block, phase = row[1:4]
- oid = row[6]
- block = int(block)
- phase = int(phase)
- oid_to_phase[ oid ] = (ctg_id, block, phase)
-
- with open(pread_to_contig_file, "w") as f:
- ovlp_data = {}
- cur_read_id = None
- skip_rest = 0
- for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file)) ).splitlines():
-
- row = row.strip().split()
- t_id = int(row[1])
- q_id = int(row[0])
- if q_id != cur_read_id:
- if cur_read_id == None:
- cur_read_id = q_id
- else:
- if len(ovlp_data) == 0:
- rid = pid_to_rid[cur_read_id].split("/")[1]
- rid = int(int(rid)/10)
- o_id = rid_to_oid[ rid ]
- print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
- else:
- ovlp_v = ovlp_data.values()
- ovlp_v.sort()
- rank = 0
- for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
- print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
- rank += 1
- ovlp_data = {}
- cur_read_id = q_id
- skip_rest = 0
-
- if q_id in pid_to_contigs and len(ovlp_data) == 0: #if the query is in some contig....
- t_o_id, ctgs = pid_to_contigs[ q_id ]
- rid = pid_to_rid[q_id].split("/")[1]
- rid = int(int(rid)/10)
- o_id = rid_to_oid[ rid ]
- for ctg in list(ctgs):
- ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
- ovlp_data[ctg][0] = -int(row[7])
- ovlp_data[ctg][1] += 1
- skip_rest = 1
-
- if skip_rest == 1:
- continue
-
- if t_id not in pid_to_contigs:
- continue
-
- q_rid = int( int(pid_to_rid[q_id].split("/")[1])/10 )
- q_phase = oid_to_phase.get( rid_to_oid[ q_rid ], None )
-
- if q_phase != None:
- ctg_id, block, phase = q_phase
- if block != -1:
- t_rid = int( int(pid_to_rid[t_id].split("/")[1])/10 )
- t_phase = oid_to_phase.get( rid_to_oid[ t_rid ], None )
- if t_phase != None:
- if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
- continue
-
- t_o_id, ctgs = pid_to_contigs[ t_id ]
- rid = pid_to_rid[q_id].split("/")[1]
- rid = int(int(rid)/10)
- o_id = rid_to_oid[ rid ]
-
- for ctg in list(ctgs):
- ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
- ovlp_data[ctg][0] += int(row[2])
- ovlp_data[ctg][1] += 1
-
- if len(ovlp_data) != 0:
- ovlp_v = ovlp_data.values()
- ovlp_v.sort()
- rank = 0
- for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
- print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
- rank += 1
-
-
-phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads"))
-
-
-for las_key, las_file in all_raw_las_files.items():
- las_fn = fn(las_file)
- idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
- idx = int(idx.split(".")[1])
- rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
- make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file,
- "rawread_db": rawread_db,
- "read_to_contig_map": read_to_contig_map,
- "rawread_id_file": rawread_id_file,
- "pread_id_file": pread_id_file,
- "phased_reads" : phased_reads},
- outputs = { "rawread_to_contig_file": rawread_to_contig_file },
- TaskType = PypeThreadTaskBase,
- URL = "task://localhost/r_read_to_contigs.%s" % idx )
- dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)
- wf.addTask( dump_rawread_to_ctg_task )
-
-for las_key, las_file in all_pread_las_files.items():
- las_fn = fn(las_file)
- idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
- idx = int(idx.split(".")[1])
- pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
- make_dump_pread_to_ctg = PypeTask( inputs = { "las_file": las_file,
- "pread_db": pread_db,
- "read_to_contig_map": read_to_contig_map,
- "rawread_id_file": rawread_id_file,
- "pread_id_file": pread_id_file,
- "phased_reads": phased_reads},
- outputs = { "pread_to_contig_file": pread_to_contig_file },
- TaskType = PypeThreadTaskBase,
- URL = "task://localhost/pread_to_contigs.%s" % idx )
- dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
- wf.addTask( dump_pread_to_ctg_task )
-
-wf.refreshTargets() # block
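A check that recurs in the deleted tracking scripts: an overlap is ignored when both reads were phased into the same block of the same contig but to opposite phases. A minimal sketch of that test, using the (ctg_id, block, phase) tuples the scripts build from all_phased_reads:

def cross_phase(q_phase, t_phase):
    # True when two phased reads sit in the same block but on opposite phases.
    if q_phase is None or t_phase is None:
        return False  # unphased reads are never excluded
    q_ctg, q_block, q_ph = q_phase
    t_ctg, t_block, t_ph = t_phase
    if q_block == -1 or t_block == -1:
        return False
    return q_ctg == t_ctg and q_block == t_block and q_ph != t_ph

print(cross_phase(('000000F', 3, 0), ('000000F', 3, 1)))  # True: skip this overlap
print(cross_phase(('000000F', 3, 0), ('000000F', 4, 1)))  # False: different blocks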
diff --git a/FALCON_unzip/src/py_scripts/fc_unzip.py b/FALCON_unzip/src/py_scripts/fc_unzip.py
deleted file mode 100644
index 0e6bb71..0000000
--- a/FALCON_unzip/src/py_scripts/fc_unzip.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from falcon_unzip.unzip import main
-import sys
-if __name__ == "__main__":
- main(sys.argv)
diff --git a/FALCON_unzip/src/py_utils/align.sh b/FALCON_unzip/src/py_utils/align.sh
deleted file mode 100644
index 2bcb508..0000000
--- a/FALCON_unzip/src/py_utils/align.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-~/bin/blasr_yli_0317 000000F_reads.fa 000000F_ref.fa -noSplitSubreads -clipping subread -placeRepeatsRandomly -minPctIdentity 70.0 -minMatch 12 -nproc 48 -bam -out aln.bam
-#~/bin/blasr_yli_0317 ../data/chm1_bam/all_merged.bam pa_ctg.fa -noSplitSubreads -clipping subread -placeRepeatsRandomly -minPctIdentity 70.0 -minMatch 12 -nproc 48 -bam -out test_chm1.bam
-#samtools sort -m 120G test_chm13.bam chm13_aln
-#samtools sort -m 120G test_chm1.bam chm1_aln
-#python screen_aln_chm1.py
-#python screen_aln_chm13.py
-#bamtools merge -in chm1_aln_sampled.bam -in chm13_aln_sampled.bam -out chm1_chm13_sampled_merged.bam
-#samtools sort -m 120G chm1_chm13_sampled_merged.bam chm1_chm13_sampled_merged_sorted
-#samtools index chm1_chm13_sampled_merged_sorted.bam
-#variantCaller.py --skipUnrecognizedContigs -v -j40 -r pa_ctg.fa --algorithm=quiver chm1_chm13_sampled_merged_sorted.bam -o cns.fastq.gz -o cns.fasta.gz
diff --git a/FALCON_unzip/src/py_utils/call_SV_from_mum.py b/FALCON_unzip/src/py_utils/call_SV_from_mum.py
deleted file mode 100644
index ac8e441..0000000
--- a/FALCON_unzip/src/py_utils/call_SV_from_mum.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import sys
-
-mum_fn = sys.argv[1]
-
-def process_mum_data( pread_id, mum_data ):
- delta = 0
- pend = 0
- qend = 0
- p_delta = 0
-
-
- rtn = []
- for rpos, qpos, len_ in mum_data:
- rpos = int(rpos)
- qpos = int(qpos)
- len_ = int(len_)
- offset = rpos - qpos
-
- if pend == 0:
- rtn.append( ( rpos, qpos, len_, offset, rpos+len_, 0, 0, 0, pread_id) )
- else:
- rtn.append( ( rpos, qpos, len_, offset, rpos+len_, offset-p_offset, rpos-pend, qpos-qend, pread_id) )
- pend = rpos+len_
- qend = qpos+len_
- p_offset = offset
-
- #break into cluster
- cluster = []
- p_delta = 0
- for d in rtn:
- rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
- if q_delta < -10:
- continue
- if r_delta < -10:
- continue
- if len(cluster) == 0: #start a new cluster
- cluster.append([])
- cluster[-1].append( d )
- #p_delta = delta
- continue
- if delta > 30000:
- #print delta, p_delta
- #p_delta = 0
- cluster.append([])
- continue
- cluster[-1].append( d )
- #p_delta = delta
- rtn = []
- for cls_id in xrange(len(cluster)):
- aln_size = 0
- qpos_ary = []
- if len(cluster[cls_id]) == 0:
- continue
- for d in cluster[cls_id]:
- rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
- aln_size += len_
- qpos_ary.append(qpos)
- if 1.0*(max(qpos_ary) - min(qpos_ary))/aln_size < 0.5 or aln_size < 2000:
- continue
-
- for d in cluster[cls_id]:
- rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
- rtn.append( (rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id) )
-
-
- return rtn
-
-
-
-mum_data = []
-with open(mum_fn) as mum_file:
- for row in mum_file:
- row = row.strip().split()
- if row[0] == ">":
- if len(mum_data) > 0:
- rtn = process_mum_data( pread_id, mum_data )
- if len(rtn) > 0:
- print ">",pread_id
- for d in rtn:
- print " ".join([str(c) for c in d])
- pread_id = row[1]
- mum_data = []
- #print
- else:
- mum_data.append( row )
diff --git a/FALCON_unzip/src/py_utils/fetch_read.py b/FALCON_unzip/src/py_utils/fetch_read.py
deleted file mode 100644
index 74def7e..0000000
--- a/FALCON_unzip/src/py_utils/fetch_read.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from falcon_kit.FastaReader import FastaReader
-
-import sys
-
-f = FastaReader(sys.argv[1])
-rl = set(open(sys.argv[2]).read().split())
-for r in f:
- rid = r.name.split()[0]
- if rid not in rl:
- continue
- print ">"+rid
- print r.sequence
diff --git a/FALCON_unzip/src/py_utils/split_bam.py b/FALCON_unzip/src/py_utils/split_bam.py
deleted file mode 100644
index f5562f0..0000000
--- a/FALCON_unzip/src/py_utils/split_bam.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import subprocess, shlex
-
-
-read_to_phase = {}
-with open("phased_reads") as f:
- for l in f:
- l = l.strip().split()
- rid = l[-1]
- phase = l[1] + "_" +l[2]
- read_to_phase[rid] = phase
-
-p = subprocess.Popen(shlex.split("samtools view -h 000207F.000207F000207F.bam" ), stdout=subprocess.PIPE)
-
-phase_to_file = {}
-header = []
-for l in p.stdout:
- l2 = l.strip().split()
- if l2[0][0] == "@":
- header.append(l.strip())
- QNAME = l2[0]
- phase = read_to_phase.get(QNAME, None)
- if phase != None:
- pfn = ("./phased_sam/%s" % phase) +".sam"
- if pfn not in phase_to_file:
- phase_to_file[pfn] = open(pfn,"w")
- print >> phase_to_file[pfn], "\n".join(header)
- print >> phase_to_file[pfn], l.strip()
-
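
The removed split_bam.py above streamed "samtools view -h" output and wrote each
phased read, together with the SAM header, into a per-phase SAM file, driven by a
phased_reads table. A parameterized sketch of the same idea (illustrative only;
the hard-coded input names from the original are replaced by arguments):

# Sketch only: split a BAM into per-phase SAM files, mirroring the removed script.
# Assumes the phased_reads table gives the phase in columns 2-3 and the read name
# in the last column, and that 'samtools' is on PATH.
import os
import subprocess

def split_sam_by_phase(bam_fn, phased_reads_fn, out_dir='phased_sam'):
    read_to_phase = {}
    with open(phased_reads_fn) as f:
        for line in f:
            fields = line.split()
            read_to_phase[fields[-1]] = fields[1] + '_' + fields[2]

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    proc = subprocess.Popen(['samtools', 'view', '-h', bam_fn],
                            stdout=subprocess.PIPE, universal_newlines=True)
    header, phase_to_file = [], {}
    for line in proc.stdout:
        if line.startswith('@'):
            header.append(line)
            continue
        phase = read_to_phase.get(line.split('\t', 1)[0])  # QNAME is the first SAM field
        if phase is None:
            continue
        if phase not in phase_to_file:
            phase_to_file[phase] = open(os.path.join(out_dir, phase + '.sam'), 'w')
            phase_to_file[phase].writelines(header)
        phase_to_file[phase].write(line)
    for fh in phase_to_file.values():
        fh.close()
    proc.wait()
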
diff --git a/pypeFLOW/pwatcher/fs_based.py b/pypeFLOW/pwatcher/fs_based.py
index e22639f..1a203a8 100755
--- a/pypeFLOW/pwatcher/fs_based.py
+++ b/pypeFLOW/pwatcher/fs_based.py
@@ -286,12 +286,14 @@ class MetaJobSge(object):
#cwd = os.getcwd()
job_name = self.get_jobname()
sge_option = qstripped(self.mjob.job.options['sge_option'])
- job_queue = self.mjob.job.options['job_queue']
+ if '-q' not in sge_option:
+ job_queue = self.mjob.job.options['job_queue']
+ sge_option = '-q {} '.format(job_queue) + sge_option
# Add shebang, in case shell_start_mode=unix_behavior.
# https://github.com/PacificBiosciences/FALCON/pull/348
with open(script_fn, 'r') as original: data = original.read()
with open(script_fn, 'w') as modified: modified.write("#!/bin/bash" + "\n" + data)
- sge_cmd = 'qsub -N {job_name} -q {job_queue} {sge_option} {specific} -cwd -o stdout -e stderr -S {exe} {script_fn}'.format(
+ sge_cmd = 'qsub -N {job_name} {sge_option} {specific} -cwd -o stdout -e stderr -S {exe} {script_fn}'.format(
**locals())
system(sge_cmd, checked=True) # TODO: Capture q-jobid
def kill(self, state, heartbeat):
@@ -330,12 +332,14 @@ usage: qsub [-a date_time] [-A account_string] [-c interval]
#cwd = os.getcwd()
job_name = self.get_jobname()
sge_option = qstripped(self.mjob.job.options['sge_option'])
- job_queue = self.mjob.job.options['job_queue']
+ if '-q' not in sge_option:
+ job_queue = self.mjob.job.options['job_queue']
+ sge_option = '-q {} '.format(job_queue) + sge_option
# Add shebang, in case shell_start_mode=unix_behavior.
# https://github.com/PacificBiosciences/FALCON/pull/348
with open(script_fn, 'r') as original: data = original.read()
with open(script_fn, 'w') as modified: modified.write("#!/bin/bash" + "\n" + data)
- sge_cmd = 'qsub -N {job_name} -q {job_queue} {sge_option} {specific} -o stdout -e stderr -S {exe} {script_fn}'.format(
+ sge_cmd = 'qsub -N {job_name} {sge_option} {specific} -o stdout -e stderr -S {exe} {script_fn}'.format(
**locals())
system(sge_cmd, checked=True) # TODO: Capture q-jobid
def kill(self, state, heartbeat):
@@ -367,13 +371,15 @@ class MetaJobTorque(object):
#cwd = os.getcwd()
job_name = self.get_jobname()
sge_option = qstripped(self.mjob.job.options['sge_option'])
- job_queue = self.mjob.job.options['job_queue']
+ if '-q' not in sge_option:
+ job_queue = self.mjob.job.options['job_queue']
+ sge_option = '-q {} '.format(job_queue) + sge_option
cwd = os.getcwd()
# Add shebang, in case shell_start_mode=unix_behavior.
# https://github.com/PacificBiosciences/FALCON/pull/348
with open(script_fn, 'r') as original: data = original.read()
with open(script_fn, 'w') as modified: modified.write("#!/bin/bash" + "\n" + data)
- sge_cmd = 'qsub -N {job_name} -q {job_queue} {sge_option} {specific} -d {cwd} -o stdout -e stderr -S {exe} {script_fn}'.format(
+ sge_cmd = 'qsub -N {job_name} {sge_option} {specific} -d {cwd} -o stdout -e stderr -S {exe} {script_fn}'.format(
**locals())
system(sge_cmd, checked=True) # TODO: Capture q-jobid
def kill(self, state, heartbeat):
@@ -402,9 +408,11 @@ class MetaJobSlurm(object):
"""
job_name = self.get_jobname()
sge_option = qstripped(self.mjob.job.options['sge_option'])
- job_queue = self.mjob.job.options['job_queue']
+ if '-p' not in sge_option:
+ job_queue = self.mjob.job.options['job_queue']
+ sge_option = '-p {} '.format(job_queue) + sge_option
cwd = os.getcwd()
- sge_cmd = 'sbatch -J {job_name} -p {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
+ sge_cmd = 'sbatch -J {job_name} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
**locals())
# "By default all environment variables are propagated."
# http://slurm.schedmd.com/sbatch.html
@@ -434,8 +442,10 @@ class MetaJobLsf(object):
"""
job_name = self.get_jobname()
sge_option = qstripped(self.mjob.job.options['sge_option'])
- job_queue = self.mjob.job.options['job_queue']
- sge_cmd = 'bsub -J {job_name} -q {job_queue} {sge_option} -o stdout -e stderr "{exe} {script_fn}"'.format(
+ if '-q' not in sge_option:
+ job_queue = self.mjob.job.options['job_queue']
+ sge_option = '-q {} '.format(job_queue) + sge_option
+ sge_cmd = 'bsub -J {job_name} {sge_option} -o stdout -e stderr "{exe} {script_fn}"'.format(
**locals())
# "Sets the user's execution environment for the job, including the current working directory, file creation mask, and all environment variables, and sets LSF environment variables before starting the job."
system(sge_cmd, checked=True) # TODO: Capture q-jobid
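
The fs_based.py hunks above all make the same change across the qsub, Torque,
Slurm, and LSF submitters: the queue flag is no longer hard-coded into the submit
command; instead, job_queue is prepended to sge_option only when the user has not
already supplied a queue there. The logic boils down to the helper sketched below
(the name merge_queue_option is invented here; the patch inlines the check in each
submit method, and the check is a plain substring test, exactly as in the hunks):

# Illustrative helper: respect a queue already given in sge_option, otherwise
# fall back to the configured job_queue.
def merge_queue_option(sge_option, job_queue, flag='-q'):
    if flag not in sge_option:
        sge_option = '{} {} '.format(flag, job_queue) + sge_option
    return sge_option

# Placeholder queue names, for illustration:
# merge_queue_option('-pe smp 8', 'bigmem')          -> '-q bigmem -pe smp 8'
# merge_queue_option('-q prod -pe smp 8', 'bigmem')  -> unchanged
# merge_queue_option('-c 4', 'compute', flag='-p')   -> '-p compute -c 4' (Slurm uses -p)
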
diff --git a/pypeFLOW/pypeflow/do_task.py b/pypeFLOW/pypeflow/do_task.py
index 3e9a868..1366059 100644
--- a/pypeFLOW/pypeflow/do_task.py
+++ b/pypeFLOW/pypeflow/do_task.py
@@ -131,6 +131,10 @@ def run(json_fn, timeout, tmpdir):
LOG.debug('Loading JSON from {!r}'.format(json_fn))
cfg = json.loads(open(json_fn).read())
LOG.debug(pprint.pformat(cfg))
+ rundir = os.path.dirname(json_fn)
+ with cd(rundir):
+ run_cfg_in_tmpdir(cfg, tmpdir)
+def run_cfg_in_tmpdir(cfg, tmpdir):
for fn in cfg['inputs'].values():
wait_for(fn)
inputs = cfg['inputs']
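
The do_task.py hunk makes run() execute the task config with the JSON's own
directory as the working directory, via a cd() context manager that must be
available in do_task.py (not shown in this hunk), and moves the body into
run_cfg_in_tmpdir() so it can be reused. A minimal stand-in for such a context
manager, purely for illustration:

# Stand-in sketch of a cd() context manager like the one run() uses above
# (the real helper is not shown in this hunk).
import contextlib
import os

@contextlib.contextmanager
def cd(newdir):
    prevdir = os.getcwd()
    os.chdir(newdir or '.')  # os.path.dirname() yields '' for a bare filename
    try:
        yield
    finally:
        os.chdir(prevdir)
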
diff --git a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
index e47bb76..5dc4c60 100644
--- a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
+++ b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
@@ -64,7 +64,6 @@ class PwatcherTaskQueue(object):
#sge_option='-pe smp 8 -q default'
for node in nodes:
#node.satisfy() # This would do the job without a process-watcher.
- mkdirs(node.wdir)
generated_script_fn = node.execute() # misnomer; this only dumps task.json now
if not generated_script_fn:
raise Exception('Missing generated_script_fn for Node {}'.format(node))
@@ -156,14 +155,29 @@ def get_unsatisfied_subgraph(g):
if n.satisfied():
unsatg.remove_node(n)
return unsatg
-def find_next_ready(g, node):
+def find_all_ancestors(g):
+ ancestors = dict()
+ def get_ancestors(node):
+ """memoized"""
+ if node not in ancestors:
+ myancestors = set()
+ for pred in g.predecessors_iter(node):
+ myancestors.update(get_ancestors(pred))
+ ancestors[node] = myancestors
+ return ancestors[node]
+ for node in g.nodes():
+ get_ancestors(node)
+ return ancestors
+def find_next_ready_and_remove(g, node):
"""Given a recently satisfied node,
return any successors with in_degree 1.
+ Then remove node from g immediately.
"""
ready = set()
for n in g.successors_iter(node):
if 1 == g.in_degree(n):
ready.add(n)
+ g.remove_node(node)
return ready
def find_all_roots(g):
"""Find all nodes in g which have no predecessors.
@@ -267,8 +281,7 @@ class Workflow(object):
LOG.info('recently_satisfied: {!r}'.format(recently_satisfied))
LOG.info('Num satisfied in this iteration: {}'.format(len(recently_satisfied)))
for node in recently_satisfied:
- ready.update(find_next_ready(unsatg, node))
- unsatg.remove_node(node)
+ ready.update(find_next_ready_and_remove(unsatg, node))
LOG.info('Num still unsatisfied: {}'.format(len(unsatg)))
if recently_done:
msg = 'Some tasks are recently_done but not satisfied: {!r}'.format(recently_done)
@@ -283,7 +296,7 @@ class Workflow(object):
raise Exception('We had {} failures. {} tasks remain unsatisfied.'.format(
failures, len(unsatg)))
- def __init__(self, watcher, job_type, job_queue, max_jobs, use_tmpdir, sge_option=''):
+ def __init__(self, watcher, job_type, job_queue, max_jobs, use_tmpdir, squash, sge_option=''):
self.graph = networkx.DiGraph()
self.tq = PwatcherTaskQueue(watcher=watcher, job_type=job_type, job_queue=job_queue, sge_option=sge_option) # TODO: Inject this.
assert max_jobs > 0, 'max_jobs needs to be set. If you use the "blocking" process-watcher, it is also the number of threads.'
@@ -291,6 +304,8 @@ class Workflow(object):
self.sentinels = dict() # sentinel_done_fn -> Node
self.pypetask2node = dict()
self.use_tmpdir = use_tmpdir
+ self.squash = squash # This really should depend on each Task, but for now a global is fine.
+ # For small genomes, serial tasks should always be squashed.
class NodeBase(object):
"""Graph node.
@@ -353,22 +368,15 @@ touch {sentinel_done_fn}
self.name = name
self.wdir = wdir
self.needs = needs
-class ComboNode(NodeBase):
- """Several Nodes to be executed in sequence.
- Only this ComboNode will be in the DiGraph, not the sub-Nodes.
- """
- def generate_script(self):
- raise NotImplementedError(repr(self))
- def __init__(self, nodes):
- #super(ComboNode, self).__init__(name, wdir, needs)
- self.nodes = nodes
class PypeNode(NodeBase):
"""
We will clean this up later. For now, it is pretty tightly coupled to PypeTask.
"""
def generate_script(self):
+ wdir = self.wdir
+ mkdirs(wdir)
pt = self.pypetask
- wdir = pt.wdir # For now, assume dir already exists.
+ assert pt.wdir == self.wdir
inputs = {k:v.path for k,v in pt.inputs.items()}
outputs = {k:os.path.relpath(v.path, wdir) for k,v in pt.outputs.items()}
for v in outputs.values():
@@ -391,7 +399,7 @@ env | sort
pwd
time {cmd}
""".format(**locals())
- script_fn = os.path.join(pt.wdir, 'task.sh')
+ script_fn = os.path.join(wdir, 'task.sh')
open(script_fn, 'w').write(script_content)
return script_fn
def old_generate_script(self):
@@ -481,6 +489,13 @@ def PypeTask(inputs, outputs, parameters=None, wdir=None):
if wdir is None:
#wdir = parameters.get('wdir', name) # One of these must be a string!
wdir = find_work_dir([only_path(v) for v in outputs.values()])
+ # Since we derived wdir from outputs, we need to ensure that they
+ # are not relative to wdir.
+ for k,v in outputs.items():
+ if not isinstance(v, PypeLocalFile) and not os.path.isabs(v):
+ outputs[k] = os.path.relpath(v, wdir)
+ if not os.path.isabs(wdir):
+ wdir = os.path.abspath(wdir)
this = _PypeTask(inputs, outputs, wdir, parameters)
#basedir = os.path.basename(wdir)
basedir = this.name
@@ -533,7 +548,7 @@ class _PypeTask(object):
URL = 'task://localhost/{}'.format(name)
self.inputs = inputs
self.outputs = outputs
- self.parameters = parameters
+ self.parameters = dict(parameters) # Always copy this, so caller can re-use, for convenience.
self.wdir = wdir
self.name = name
self.URL = URL
@@ -562,6 +577,7 @@ def PypeProcWatcherWorkflow(
max_jobs = 24, # must be > 0, but not too high
sge_option = None,
use_tmpdir = None,
+ squash = False,
**attributes):
"""Factory for the workflow.
"""
@@ -580,8 +596,13 @@ def PypeProcWatcherWorkflow(
use_tmpdir = os.path.abspath(use_tmpdir)
except (TypeError, AttributeError):
use_tmpdir = tempfile.gettempdir()
- LOG.info('job_type={!r}, job_queue={!r}, sge_option={!r}, use_tmpdir={!r}'.format(job_type, job_queue, sge_option, use_tmpdir))
- return Workflow(watcher, job_type=job_type, job_queue=job_queue, max_jobs=max_jobs, sge_option=sge_option, use_tmpdir=use_tmpdir)
+ LOG.info('job_type={!r}, job_queue={!r}, sge_option={!r}, use_tmpdir={!r}, squash={!r}'.format(
+ job_type, job_queue, sge_option, use_tmpdir, squash,
+ ))
+ return Workflow(watcher,
+ job_type=job_type, job_queue=job_queue, max_jobs=max_jobs, sge_option=sge_option, use_tmpdir=use_tmpdir,
+ squash=squash,
+ )
#th = MyPypeFakeThreadsHandler('mypwatcher', job_type, job_queue)
#mq = MyMessageQueue()
#se = MyFakeShutdownEvent() # TODO: Save pwatcher state on ShutdownEvent. (Not needed for blocking pwatcher. Mildly useful for fs_based.)
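
The refactored scheduling step above, find_next_ready_and_remove(), marks a
successor as ready when the just-satisfied node was its last unmet dependency and
then drops that node from the graph of unsatisfied tasks. A dict-based sketch of
the same step (a simplified stand-in for the networkx DiGraph used in the real
code; task names below are made up):

# Sketch only: {node: set(successors)} plus {node: remaining unmet dependency count}
# instead of a networkx DiGraph.
def find_next_ready_and_remove(succs, in_degree, node):
    ready = set()
    for n in succs.get(node, ()):
        in_degree[n] -= 1
        if in_degree[n] == 0:
            ready.add(n)
    succs.pop(node, None)
    in_degree.pop(node, None)
    return ready

# succs = {'align': {'merge'}, 'sort': {'merge'}, 'merge': set()}
# in_degree = {'align': 0, 'sort': 0, 'merge': 2}
# find_next_ready_and_remove(succs, in_degree, 'align') -> set()     (merge still waits on sort)
# find_next_ready_and_remove(succs, in_degree, 'sort')  -> {'merge'}
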
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/falcon.git