[med-svn] [Git][med-team/cutesv][upstream] New upstream version 2.0.3
Andreas Tille (@tille)
gitlab at salsa.debian.org
Wed Jul 12 09:01:38 BST 2023
Andreas Tille pushed to branch upstream at Debian Med / cutesv
Commits:
42a9fad2 by Andreas Tille at 2023-07-12T09:38:39+02:00
New upstream version 2.0.3
- - - - -
10 changed files:
- README.md
- setup.py
- + src/benchmarks/eval_BND.py
- + src/benchmarks/eval_generegion.py
- + src/benchmarks/eval_population.py
- src/cuteSV/cuteSV
- src/cuteSV/cuteSV_Description.py
- src/cuteSV/cuteSV_genotype.py
- src/cuteSV/cuteSV_resolveDUP.py
- + src/documentation/README.md
Changes:
=====================================
README.md
=====================================
@@ -44,6 +44,8 @@ For more detailed implementation of SV benchmarks, we show an example [here](htt
## Notice
A new wiki page about diploid-assembly-based SV detection using cuteSV has been established. More details please see [here](https://github.com/tjiangHIT/cuteSV/wiki/Diploid-assembly-based-SV-detection-using-cuteSV).
+We provide a new document for running the **force calling** (or **regenotyping**) benchmark [here](https://github.com/tjiangHIT/cuteSV/tree/master/src/documentation).
+
---
### Dependence
@@ -77,6 +79,8 @@ A new wiki page about diploid-assembly-based SV detection using cuteSV has been
--diff_ratio_merging_INS 0.3
--max_cluster_bias_DEL 100
--diff_ratio_merging_DEL 0.3
+ > For force calling:
+ --min_mapq 10
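+ > As a sketch, a complete force-calling command with these settings could look like the following (file names are illustrative):
+ cuteSV input.bam reference.fa output.vcf work_dir/ -Ivcf target.vcf --min_mapq 10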
| Parameter | Description | Default |
| :------------ |:---------------|-------------:|
@@ -105,6 +109,7 @@ A new wiki page about diploid-assembly-based SV detection using cuteSV has been
|--max_cluster_bias_TRA|Maximum distance to cluster read together for translocation.|50|
|--diff_ratio_filtering_TRA|Filter out breakpoints whose base-pair identity is below this ratio for translocations.|0.6|
|--remain_reads_ratio|The ratio of reads retained in a cluster to generate the breakpoint. Set it lower to obtain more precise breakpoints when the alignment data are of high quality; keeping it above 0.5 is recommended.|1|
+|-include_bed|Optional BED file. Only detect SVs within the regions given in the BED file.|NULL|
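+
+As a sketch, a BED file for `-include_bed` carries one region per line as tab-separated `chrom start end` (the three columns read by the new `load_bed` helper); the coordinates and file names below are illustrative:
+
+    1	10000	20000
+    2	500000	600000
+
+    cuteSV input.bam reference.fa output.vcf work_dir/ -include_bed regions.bed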
---
### Datasets generated from cuteSV
@@ -117,6 +122,10 @@ Please cite the manuscript of cuteSV before using these callsets.
---
### Changelog
+ cuteSV (v2.0.3):
+ 1. Fix the error caused by the missing min_size parameter.
+ 2. Fix the missing signatures in duplication clustering.
+
cuteSV (v2.0.2):
1. Fix several errors in signature extraction.
2. Filter low quality reads in the statistics of reference reads.
=====================================
setup.py
=====================================
@@ -7,7 +7,7 @@ with open('README.md') as f:
setup(
name = "cuteSV",
- version = "2.0.2",
+ version = "2.0.3",
description = "Long-read-based human genomic structural variation detection with cuteSV",
author = "Jiang Tao",
author_email = "tjiang at hit.edu.cn",
=====================================
src/benchmarks/eval_BND.py
=====================================
@@ -0,0 +1,130 @@
+import sys
+import argparse
+import logging
+import time
+
+def pase_info(seq):
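+ # parse a VCF INFO column into a dict of SV attributes: SVLEN/END/RE as non-negative integers, CHR2 and the 3-letter SVTYPE as strings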
+ info = {'SVLEN': 0, 'END': 0, "SVTYPE": '', "RE": 0, "CHR2": ''}
+ for i in seq.split(';'):
+ if i.split('=')[0] in ["SVLEN", "END", "RE"]:
+ try:
+ info[i.split('=')[0]] = abs(int(float(i.split('=')[1])))
+ except:
+ pass
+ if i.split('=')[0] in ["CHR2"]:
+ info[i.split('=')[0]] = i.split('=')[1]
+ if i.split('=')[0] in ["SVTYPE"]:
+ info[i.split('=')[0]] = i.split('=')[1][0:3]
+
+ return info
+
+def phase_GT(seq):
+ i = seq.split(':')
+ if i[0] in ['0/1', '1/0']:
+ return 'het'
+ elif i[0] == '1/1':
+ return 'hom'
+ else:
+ return 'unknown'
+
+def load_callset(path):
+ callset = dict()
+ file = open(path, 'r')
+ for line in file:
+ seq = line.strip('\n').split('\t')
+ if seq[0][0] == '#':
+ continue
+
+ chr = seq[0]
+ pos = int(seq[1])
+ info = pase_info(seq[7])
+ if info['SVTYPE'] == "TRA":
+ info['SVTYPE'] = "BND"
+
+ if info['SVTYPE'] == "BND":
+ if seq[4][0] == ']':
+ form = ']]N'
+ chr2 = seq[4].split(':')[0][1:]
+ pos2 = int(seq[4].split(':')[1][:-2])
+ elif seq[4][0] == '[':
+ form = '[[N'
+ chr2 = seq[4].split(':')[0][1:]
+ pos2 = int(seq[4].split(':')[1][:-2])
+ else:
+ if seq[4][1] == ']':
+ form = 'N]]'
+ chr2 = seq[4].split(':')[0][2:]
+ pos2 = int(seq[4].split(':')[1][:-1])
+ else:
+ form = 'N[['
+ chr2 = seq[4].split(':')[0][2:]
+ pos2 = int(seq[4].split(':')[1][:-1])
+ if info['SVTYPE'] not in callset:
+ callset[info['SVTYPE']] = list()
+ if info['END'] == 0:
+ info['CHR2'] = chr2
+ info['END'] = pos2
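+ # order the two breakends by numeric chromosome ID; non-integer chromosome names fall through to the except branch below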
+ try:
+ if int(chr) <= int(info['CHR2']):
+ if form == 'N[[':
+ form = ']]N'
+ elif form == ']]N':
+ form = 'N[['
+ callset[info['SVTYPE']].append([chr, pos, info['CHR2'], info['END'], form, phase_GT(seq[9]), 0])
+ else:
+ callset[info['SVTYPE']].append([info['CHR2'], info['END'], chr, pos, form, phase_GT(seq[9]), 0])
+ except:
+ callset[info['SVTYPE']].append([chr, pos, info['CHR2'], info['END'], form, phase_GT(seq[9]), 0])
+
+ file.close()
+ return callset
+
+def eval(call, ans, offect):
+ tpcall = 0
+ for i in call["BND"]:
+ for j in ans["BND"]:
+ if i[0] == j[0] and i[2] == j[2] and abs(i[1]-j[1]) <= offect and abs(i[3]-j[3]) <= offect:
+ tpcall += 1
+ break
+ fp = len(call["BND"]) - tpcall
+ return fp, len(call["BND"])
+
+def main_ctrl(args):
+ # load ground truth set
+ base_list = load_callset(args.base)
+ comp_list = load_callset(args.comp)
+
+ fp, total = eval(comp_list, base_list, args.offect)
+ logging.info('False positive in BND: %d'%(fp))
+ logging.info('Total amount of BND: %d'%(total))
+
+def main(argv):
+ args = parseArgs(argv)
+ setupLogging(False)
+ # print args
+ starttime = time.time()
+ main_ctrl(args)
+ logging.info("Finished in %0.2f seconds."%(time.time() - starttime))
+
+USAGE="""\
+ Evaluate SV callset generated by simulations.
+ Author: Shuqi Cao
+ Email: sqcao at stu.hit.edu.cn
+"""
+
+def parseArgs(argv):
+ parser = argparse.ArgumentParser(prog="evaluation on BND", description=USAGE, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.add_argument("base", type=str, help="Ground truth of BNDs.")
+ parser.add_argument("comp", type=str, help="BND callsets to be benched.")
+ parser.add_argument('-o', '--offect', help = "Offect of translocation overlaping.[%(default)s]", default = 1000, type = int)
+ args = parser.parse_args(argv)
+ return args
+
+def setupLogging(debug=False):
+ logLevel = logging.DEBUG if debug else logging.INFO
+ logFormat = "%(asctime)s [%(levelname)s] %(message)s"
+ logging.basicConfig( stream=sys.stderr, level=logLevel, format=logFormat )
+ logging.info("Running %s" % " ".join(sys.argv))
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
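
A typical invocation, following the script's argument parser (file names are illustrative):

```sh
python eval_BND.py truth_bnd.vcf calls_bnd.vcf -o 1000
```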
=====================================
src/benchmarks/eval_generegion.py
=====================================
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# coding=utf-8
+import sys
+import argparse
+import joblib
+import logging
+import time
+
+# usage: python eval_generegion.py --anno <anno.jl> --vcf <input.vcf> --gene <gene_related.vcf> --intergenetic <intergenic.vcf>
+def main_ctrl(args):
+ annos = joblib.load(args.anno)
+ length = len(annos)
+ pos_list = []
+ for i in range(length):
+ to_check = annos.loc[i]
+ start_pos = int(to_check["vcf_key"].split(':')[1].split('-')[0]) + 1 # corresponding pos in vcf
+ if len(pos_list) == 0 or pos_list[-1] != start_pos:
+ pos_list.append(start_pos)
+ gene_related = open(args.gene, 'w')
+ intergenetic = open(args.intergenetic, 'w')
+ idx = 0
+ with open(args.vcf, 'r') as f:
+ for line in f:
+ if line[0] == '#':
+ gene_related.write(line)
+ intergenetic.write(line)
+ else:
+ pos = int(line.strip().split('\t')[1])
+ if idx < len(pos_list) and pos == pos_list[idx]:
+ gene_related.write(line)
+ idx += 1
+ else:
+ intergenetic.write(line)
+ logging.info('Finish')
+
+# truvari anno bpovl -i LGY.vcf.gz -a changed.gtf.gz -o LGY.anno.jl -p gff
+def main(argv):
+ args = parseArgs(argv)
+ setupLogging(False)
+ starttime = time.time()
+ main_ctrl(args)
+ logging.info("Finished in %0.2f seconds."%(time.time() - starttime))
+
+USAGE="""\
+ Evaluate SV callset generated by simulations.
+ Author: Shuqi Cao
+ Email: sqcao at stu.hit.edu.cn
+"""
+
+def parseArgs(argv):
+ parser = argparse.ArgumentParser(prog="evaluation on HWE/ExcHet/Hete/Known worldwide cohort", description=USAGE, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.add_argument("--anno", type=str, help="Gene region annotation from truvari anno bpovl.")
+ parser.add_argument("--vcf", type=str, help="VCF file to be extracted.")
+ parser.add_argument("--gene", type=str, help="SVs in gene-related regions.")
+ parser.add_argument("--intergenetic", type=str, help="SVs in intergenetic regions.")
+ args = parser.parse_args(argv)
+ return args
+
+def setupLogging(debug=False):
+ logLevel = logging.DEBUG if debug else logging.INFO
+ logFormat = "%(asctime)s [%(levelname)s] %(message)s"
+ logging.basicConfig( stream=sys.stderr, level=logLevel, format=logFormat )
+ logging.info("Running %s" % " ".join(sys.argv))
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
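
Per the comments in the script, the gene-region annotation comes from `truvari anno bpovl`; a sketch of the full flow with illustrative file names:

```sh
truvari anno bpovl -i sample.vcf.gz -a genes.gtf.gz -o sample.anno.jl -p gff
python eval_generegion.py --anno sample.anno.jl --vcf sample.vcf --gene gene_related.vcf --intergenetic intergenic.vcf
```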
=====================================
src/benchmarks/eval_population.py
=====================================
@@ -0,0 +1,178 @@
+import sys
+import argparse
+import logging
+import time
+
+def hwe_exchet(population_vcf):
+ output = open('hwe_exchet.csv', 'w')
+ output.write('MissingRate\tHWE\tExcHet\n')
+ hwe_list = [0 for i in range(22)]
+ exchet_list = [0 for i in range(22)]
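+ # bins 0-19 cover the interval (i/20, (i+1)/20]; bin 20 counts AC=0 records and bin 21 records with more than 10 missing allele calls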
+ with open(population_vcf) as f:
+ for line in f:
+ if line[0] == '#':
+ continue
+ else:
+ seq = line.strip().split('\t')
+ ac = int(seq[7].split(';AC=')[1].split(';')[0])
+ hwe = float(seq[7].split(';HWE=')[1].split(';')[0])
+ exchet = float(seq[7].split(';ExcHet=')[1])
+ if ac == 0:
+ hwe_list[20] += 1
+ exchet_list[20] += 1
+ continue
+ missing_cnt = 0
+ for i in range(9, 109, 1):
+ if seq[i][0] == '.':
+ missing_cnt += 1
+ if seq[i][2] == '.':
+ missing_cnt += 1
+ if missing_cnt > 10:
+ hwe_list[21] += 1
+ exchet_list[21] += 1
+ for i in range(20):
+ if i / 20 < hwe <= (i + 1) / 20:
+ hwe_list[i] += 1
+ if i / 20 < exchet <= (i + 1) / 20:
+ exchet_list[i] += 1
+ output.write('%f\t%f\t%f\n'%(missing_cnt / 200, hwe, exchet))
+
+def hete(population_vcf):
+ output = open('hete.csv', 'w')
+ output.write('AF\tHete\n')
+ with open(population_vcf) as f:
+ for line in f:
+ if line[0] == '#':
+ continue
+ else:
+ seq = line.strip().split('\t')
+ try:
+ af = float(seq[7].split(';AF=')[1].split(';')[0])
+ except:
+ ac = 0
+ an = 0
+ for i in range(9, 109, 1):
+ if seq[i][0] == '1':
+ ac += 1
+ an += 1
+ if seq[i][2] == '1':
+ ac += 1
+ an += 1
+ if seq[i][0] == '0':
+ an += 1
+ if seq[i][2] == '0':
+ an += 1
+ if an == 0:
+ af = 0.0
+ else:
+ af = ac / an
+ if af <= 0.0001:
+ continue
+ hete = 0
+ for i in range(9, 109, 1):
+ if seq[i][0] == '0' and seq[i][2] == '1':
+ hete += 1
+ output.write('%f\t%f\n'%(af, hete / 100))
+
+def compare(base_file, comp_file):
+ def parse(file):
+ svs_dict = dict()
+ with open(file, 'r') as f:
+ for line in f:
+ if line[0] == '#':
+ continue
+ seq = line.strip().split('\t')
+ chrom = seq[0]
+ pos = int(seq[1])
+ svtype = seq[7].split('SVTYPE=')[1].split(';')[0]
+ if svtype == 'DEL' or svtype == 'INS':
+ svlen = abs(int(seq[7].split('SVLEN=')[1].split(';')[0]))
+ af = float(seq[7].split(';AF=')[1].split(';')[0])
+ if chrom not in svs_dict:
+ svs_dict[chrom] = list()
+ svs_dict[chrom].append([pos, svtype, svlen, af])
+ return svs_dict
+ base = parse(base_file)
+ comp = parse(comp_file)
+ ans = dict()
+ output = open('difference.csv', 'w')
+ result_5 = []
+ pos_bias = 1000
+ length_ratio = 0.7
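+ # two SVs match when they share the SV type, their positions differ by at most pos_bias, and their length ratio exceeds length_ratio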
+ for chrom in base:
+ if chrom in comp:
+ ans[chrom] = list()
+ for basesv in base[chrom]:
+ for compsv in comp[chrom]:
+ if basesv[1] == compsv[1] and abs(basesv[0] - compsv[0]) <= pos_bias and min(basesv[2], compsv[2]) / max(basesv[2], compsv[2]) > length_ratio:
+ ans[chrom].append((basesv[1], basesv[0], compsv[0], basesv[3], compsv[3]))
+ output.write('%s\t%f\t%f\t%f\n'%(basesv[1], basesv[3], compsv[3], basesv[3]-compsv[3]))
+ if abs(basesv[3] - compsv[3]) > 0.5:
+ result_5.append([basesv[1], chrom, basesv[0]])
+ break
+ return result_5
+
+def eval_known_callset(sniffles, sniffles2, cutesv, base):
+ sniffles = compare(base, sniffles)
+ sniffles2 = compare(base, sniffles2)
+ cutesv = compare(base, cutesv)
+ ss2 = 0
+ sc = 0
+ s2c = 0
+ ss2c = 0
+ for item1 in sniffles:
+ for item2 in sniffles2:
+ if item1[1] == item2[1] and item1[0] == item2[0] and item1[2] == item2[2]:
+ ss2 += 1
+ for item2 in cutesv:
+ if item1[1] == item2[1] and item1[0] == item2[0] and item1[2] == item2[2]:
+ sc += 1
+ for item1 in cutesv:
+ for item2 in sniffles2:
+ if item1[1] == item2[1] and item1[0] == item2[0] and item1[2] == item2[2]:
+ s2c += 1
+ for item3 in sniffles:
+ if item1[1] == item3[1] and item1[0] == item3[0] and item1[2] == item3[2]:
+ ss2c += 1
+ logging.info('Sniffles1: %d\tSniffles2: %d\tcuteSV2: %d'%(len(sniffles), len(sniffles2), len(cutesv)))
+ logging.info('Sniffles1&Sniffles2: %d\tSniffles1&cuteSV2: %d\tSniffles2&cuteSV2: %d'%(ss2, sc, s2c))
+ logging.info('Sniffles1&Sniffles2&cuteSV2: %d'%(ss2c))
+
+def main(argv):
+ args = parseArgs(argv)
+ setupLogging(False)
+ starttime = time.time()
+ if args.handle == 'HWE' or args.handle == 'ExcHet':
+ hwe_exchet(args.comp)
+ if args.handle == 'Hete':
+ hete(args.comp)
+ if args.handle == 'Known':
+ eval_known_callset(args.sniffles, args.sniffles2, args.cutesv, args.base)
+ logging.info("Finished in %0.2f seconds."%(time.time() - starttime))
+
+USAGE="""\
+ Evaluate SV callset generated by simulations.
+ Author: Shuqi Cao
+ Email: sqcao at stu.hit.edu.cn
+"""
+
+def parseArgs(argv):
+ parser = argparse.ArgumentParser(prog="evaluation on HWE/ExcHet/Hete/Known worldwide cohort", description=USAGE, formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.add_argument("handle", type=str, help="The aspect of evaluation, contains HWE/ExcHet/Hete/Known.")
+ parser.add_argument("--comp", type=str, help="BND callsets to be benched.")
+ parser.add_argument("--base", type=str, help="Ground truth of BNDs.")
+ parser.add_argument("--sniffles", type=str, help="Callsets of Sniffles1.")
+ parser.add_argument("--sniffles2", type=str, help="Callsets of Sniffles2.")
+ parser.add_argument("--cutesv", type=str, help="Callsets of cuteSV2.")
+ parser.add_argument('-o', '--offect', help = "Maximum allowed offset when matching translocation breakpoints.[%(default)s]", default = 1000, type = int)
+ args = parser.parse_args(argv)
+ return args
+
+def setupLogging(debug=False):
+ logLevel = logging.DEBUG if debug else logging.INFO
+ logFormat = "%(asctime)s [%(levelname)s] %(message)s"
+ logging.basicConfig( stream=sys.stderr, level=logLevel, format=logFormat )
+ logging.info("Running %s" % " ".join(sys.argv))
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
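
A sketch of the evaluation modes the parser accepts (file names are illustrative):

```sh
python eval_population.py HWE --comp population.vcf    # also covers ExcHet; writes hwe_exchet.csv
python eval_population.py Hete --comp population.vcf   # writes hete.csv
python eval_population.py Known --base truth.vcf --sniffles sniffles1.vcf --sniffles2 sniffles2.vcf --cutesv cutesv.vcf
```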
=====================================
src/cuteSV/cuteSV
=====================================
@@ -4,8 +4,8 @@
* All rights Reserved, Designed By HIT-Bioinformatics
* @Title: cuteSV
* @author: tjiang & sqcao
- * @date: Sep 12th 2022
- * @version V2.0.2
+ * @date: May 12th 2023
+ * @version V2.0.3
'''
import pysam
@@ -19,7 +19,7 @@ from cuteSV.cuteSV_resolveINV import run_inv
from cuteSV.cuteSV_resolveTRA import run_tra
from cuteSV.cuteSV_resolveINDEL import run_ins, run_del
from cuteSV.cuteSV_resolveDUP import run_dup
-from cuteSV.cuteSV_genotype import generate_output, generate_pvcf, load_valuable_chr
+from cuteSV.cuteSV_genotype import generate_output, generate_pvcf, load_valuable_chr, load_bed
from cuteSV.cuteSV_forcecalling import force_calling_chrom
import os
import argparse
@@ -610,7 +610,7 @@ def parse_read(read, Chr_name, SV_size, min_mapq, max_split_parts, min_read_len,
return candidate
def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, temp_dir,
- task, min_siglength, merge_del_threshold, merge_ins_threshold, MaxSize):
+ task, min_siglength, merge_del_threshold, merge_ins_threshold, MaxSize, bed_regions):
candidate = list()
reads_info_list = list()
@@ -618,7 +618,19 @@ def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, t
samfile = pysam.AlignmentFile(sam_path)
for read in samfile.fetch(Chr_name, task[1], task[2]):
- if read.reference_start >= task[1]:
+ pos_start = read.reference_start # 0-based
+ pos_end = read.reference_end
+ in_bed = False
+ if bed_regions != None:
+ for bed_region in bed_regions:
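+ # half-open interval test: skip regions the read does not overlap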
+ if pos_end <= bed_region[0] or pos_start >= bed_region[1]:
+ continue
+ else:
+ in_bed = True
+ break
+ else:
+ in_bed = True
+ if read.reference_start >= task[1] and in_bed:
read_candidate = parse_read(read, Chr_name, min_length, min_mapq, max_split_parts,
min_read_len, min_siglength, merge_del_threshold,
merge_ins_threshold, MaxSize)
@@ -627,8 +639,6 @@ def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, t
is_primary = 0
if read.flag in [0, 16]:
is_primary = 1
- pos_start = read.reference_start # 0-based
- pos_end = read.reference_end
reads_info_list.append([pos_start, pos_end, is_primary, read.query_name])
samfile.close()
# print('finish %s:%d-%d in %f seconds.'%(task[0], task[1], len(reads_info_list), time.time() - start_time))
@@ -661,7 +671,6 @@ def single_pipe(sam_path, min_length, min_mapq, max_split_parts, min_read_len, t
logging.info("Finished %s:%d-%d."%(Chr_name, task[1], task[2]))
gc.collect()
-
def multi_run_wrapper(args):
return single_pipe(*args)
@@ -698,21 +707,23 @@ def main_ctrl(args, argv):
pos += args.batches
if pos < local_ref_len:
Task_list.append([i[0], pos, local_ref_len])
+ bed_regions = load_bed(args.include_bed, Task_list)
#'''
analysis_pools = Pool(processes=int(args.threads))
os.mkdir("%ssignatures"%temporary_dir)
- for i in Task_list:
+ for i in range(len(Task_list)):
para = [(args.input,
args.min_size,
args.min_mapq,
args.max_split_parts,
args.min_read_len,
temporary_dir,
- i,
+ Task_list[i],
args.min_siglength,
args.merge_del_threshold,
args.merge_ins_threshold,
- args.max_size)]
+ args.max_size,
+ None if bed_regions == None else bed_regions[i])]
analysis_pools.map_async(multi_run_wrapper, para)
analysis_pools.close()
analysis_pools.join()
=====================================
src/cuteSV/cuteSV_Description.py
=====================================
@@ -2,12 +2,12 @@
* All rights Reserved, Designed By HIT-Bioinformatics
* @Title: cuteSV_Description.py
* @author: tjiang & sqcao
- * @date: Sep 12th 2022
- * @version V2.0.2
+ * @date: May 12th 2023
+ * @version V2.0.3
'''
import argparse
-VERSION = '2.0.2'
+VERSION = '2.0.3'
class cuteSVdp(object):
'''
@@ -120,6 +120,10 @@ def parseArgs(argv):
help = "Maximum distance of insertion signals to be merged. In our paper, I used -mi 500 to process HG002 real human sample data.[%(default)s]",
default = 100,
type = int)
+ GroupSignaturesCollect.add_argument('-include_bed',
+ help = "Optional given bed file. Only detect SVs in regions in the BED file. [NULL]",
+ default = None,
+ type = str)
# The min_read_len in last version is 2000.
# signatures with overlap need to be filtered
=====================================
src/cuteSV/cuteSV_genotype.py
=====================================
@@ -8,81 +8,81 @@ prior = float(1/3)
Genotype = ["0/0", "0/1", "1/1"]
def log10sumexp(log10_probs):
- # Normalization of Genotype likelihoods
- m = max(log10_probs)
- return m + log10(sum(pow(10.0, x-m) for x in log10_probs))
+ # Normalization of Genotype likelihoods
+ m = max(log10_probs)
+ return m + log10(sum(pow(10.0, x-m) for x in log10_probs))
def normalize_log10_probs(log10_probs):
- # Adjust the Genotype likelihoods
- log10_probs = np.array(log10_probs)
- lse = log10sumexp(log10_probs)
- return np.minimum(log10_probs - lse, 0.0)
+ # Adjust the Genotype likelihoods
+ log10_probs = np.array(log10_probs)
+ lse = log10sumexp(log10_probs)
+ return np.minimum(log10_probs - lse, 0.0)
def rescale_read_counts(c0, c1, max_allowed_reads=100):
- """Ensures that n_total <= max_allowed_reads, rescaling if necessary."""
- Total = c0 + c1
- if Total > max_allowed_reads:
- c0 = int(max_allowed_reads * float(c0/Total))
- c1 = max_allowed_reads - c0
- return c0, c1
+ """Ensures that n_total <= max_allowed_reads, rescaling if necessary."""
+ Total = c0 + c1
+ if Total > max_allowed_reads:
+ c0 = int(max_allowed_reads * float(c0/Total))
+ c1 = max_allowed_reads - c0
+ return c0, c1
def cal_GL(c0, c1):
- # Approximate adjustment of events with larger read depth
- c0, c1 = rescale_read_counts(c0, c1)
- # original genotype likelihood
- # ori_GL00 = np.float64(pow((1-err), c0)*pow(err, c1)*comb(c0+c1,c0)*(1-prior)/2)
- # ori_GL11 = np.float64(pow(err, c0)*pow((1-err), c1)*comb(c0+c1,c0)*(1-prior)/2)
- # ori_GL01 = np.float64(pow(0.5, c0+c1)*comb(c0+c1,c0)*prior)
-
- ori_GL00 = np.float64(pow((1-err), c0)*pow(err, c1)*(1-prior)/2)
- ori_GL11 = np.float64(pow(err, c0)*pow((1-err), c1)*(1-prior)/2)
- ori_GL01 = np.float64(pow(0.5, c0+c1)*prior)
+ # Approximate adjustment of events with larger read depth
+ c0, c1 = rescale_read_counts(c0, c1)
+ # original genotype likelihood
+ # ori_GL00 = np.float64(pow((1-err), c0)*pow(err, c1)*comb(c0+c1,c0)*(1-prior)/2)
+ # ori_GL11 = np.float64(pow(err, c0)*pow((1-err), c1)*comb(c0+c1,c0)*(1-prior)/2)
+ # ori_GL01 = np.float64(pow(0.5, c0+c1)*comb(c0+c1,c0)*prior)
+
+ ori_GL00 = np.float64(pow((1-err), c0)*pow(err, c1)*(1-prior)/2)
+ ori_GL11 = np.float64(pow(err, c0)*pow((1-err), c1)*(1-prior)/2)
+ ori_GL01 = np.float64(pow(0.5, c0+c1)*prior)
- # normalized genotype likelihood
- prob = list(normalize_log10_probs([log10(ori_GL00), log10(ori_GL01), log10(ori_GL11)]))
- GL_P = [pow(10, i) for i in prob]
- PL = [int(np.around(-10*log10(i))) for i in GL_P]
- GQ = [int(-10*log10(GL_P[1] + GL_P[2])), int(-10*log10(GL_P[0] + GL_P[2])), int(-10*log10(GL_P[0] + GL_P[1]))]
- QUAL = abs(np.around(-10*log10(GL_P[0]), 1))
+ # normalized genotype likelihood
+ prob = list(normalize_log10_probs([log10(ori_GL00), log10(ori_GL01), log10(ori_GL11)]))
+ GL_P = [pow(10, i) for i in prob]
+ PL = [int(np.around(-10*log10(i))) for i in GL_P]
+ GQ = [int(-10*log10(GL_P[1] + GL_P[2])), int(-10*log10(GL_P[0] + GL_P[2])), int(-10*log10(GL_P[0] + GL_P[1]))]
+ QUAL = abs(np.around(-10*log10(GL_P[0]), 1))
- return Genotype[prob.index(max(prob))], "%d,%d,%d"%(PL[0], PL[1], PL[2]), max(GQ), QUAL
+ return Genotype[prob.index(max(prob))], "%d,%d,%d"%(PL[0], PL[1], PL[2]), max(GQ), QUAL
def cal_CIPOS(std, num):
- pos = int(1.96 * std / num ** 0.5)
- return "-%d,%d"%(pos,pos)
+ pos = int(1.96 * std / num ** 0.5)
+ return "-%d,%d"%(pos,pos)
def threshold_ref_count(num):
- if num <= 2:
- return 20*num
- elif 3 <= num <= 5:
- return 9*num
- elif 6 <= num <= 15:
- return 7*num
- else:
- return 5*num
+ if num <= 2:
+ return 20*num
+ elif 3 <= num <= 5:
+ return 9*num
+ elif 6 <= num <= 15:
+ return 7*num
+ else:
+ return 5*num
def count_coverage(chr, s, e, f, read_count, up_bound, itround):
- status = 0
- iteration = 0
- primary_num = 0
- for i in f.fetch(chr, s, e):
- iteration += 1
- if i.flag not in [0,16]:
- continue
- primary_num += 1
- if i.reference_start < s and i.reference_end > e:
- read_count.add(i.query_name)
- if len(read_count) >= up_bound:
- status = 1
- break
- if iteration >= itround:
- if float(primary_num/iteration) <= 0.2:
- status = 1
- else:
- status = -1
- break
+ status = 0
+ iteration = 0
+ primary_num = 0
+ for i in f.fetch(chr, s, e):
+ iteration += 1
+ if i.flag not in [0,16]:
+ continue
+ primary_num += 1
+ if i.reference_start < s and i.reference_end > e:
+ read_count.add(i.query_name)
+ if len(read_count) >= up_bound:
+ status = 1
+ break
+ if iteration >= itround:
+ if float(primary_num/iteration) <= 0.2:
+ status = 1
+ else:
+ status = -1
+ break
- return status
+ return status
def overlap_cover(svs_list, reads_list):
# [(10024, 12024), (89258, 91258), ...]
@@ -195,177 +195,179 @@ def duipai(svs_list, reads_list, iteration_dict, primary_num_dict, cover2_dict):
print('Correct iteration %d'%(correct_num))
def generate_output(args, semi_result, contigINFO, argv, ref_g):
-
- '''
- Generation of VCF format file.
- VCF version: 4.2
- '''
+
+ '''
+ Generation of VCF format file.
+ VCF version: 4.2
+ '''
- # genotype_trigger = TriggerGT[args.genotype]
+ # genotype_trigger = TriggerGT[args.genotype]
- svid = dict()
- svid["INS"] = 0
- svid["DEL"] = 0
- svid["BND"] = 0
- svid["DUP"] = 0
- svid["INV"] = 0
+ svid = dict()
+ svid["INS"] = 0
+ svid["DEL"] = 0
+ svid["BND"] = 0
+ svid["DUP"] = 0
+ svid["INV"] = 0
- file = open(args.output, 'w')
- action = args.genotype
- Generation_VCF_header(file, contigINFO, args.sample, argv)
- file.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n"%(args.sample))
- for i in semi_result:
- if i[1] in ["DEL", "INS"]:
- if abs(int(float(i[3]))) > args.max_size and args.max_size != -1:
- continue
- if i[1] == "INS":
- cal_end = int(i[2])
- else:
- cal_end = int(i[2]) + abs(int(float(i[3])))
- info_list = "{PRECISION};SVTYPE={SVTYPE};SVLEN={SVLEN};END={END};CIPOS={CIPOS};CILEN={CILEN};RE={RE};RNAMES={RNAMES}".format(
- PRECISION = "IMPRECISE" if i[8] == "0/0" else "PRECISE",
- SVTYPE = i[1],
- SVLEN = i[3],
- END = str(cal_end),
- CIPOS = i[5],
- CILEN = i[6],
- RE = i[4],
- RNAMES = i[12] if args.report_readid else "NULL")
- if action:
- try:
- info_list += ";AF=" + str(round(int(i[4]) / (int(i[4]) + int(i[7])), 4))
- except:
- info_list += ";AF=."
- if i[1] =="DEL":
- info_list += ";STRAND=+-"
- if i[11] == "." or i[11] == None:
- filter_lable = "PASS"
- else:
- filter_lable = "PASS" if float(i[11]) >= 5.0 else "q5"
- file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
- CHR = i[0],
- POS = str(int(i[2])),
- ID = "cuteSV.%s.%d"%(i[1], svid[i[1]]),
- REF = str(ref_g[i[0]].seq[max(int(i[2])-1, 0)]) if i[1] == 'INS' else str(ref_g[i[0]].seq[max(int(i[2])-1, 0):int(i[2])-int(i[3])]),
- ALT = "%s"%(str(ref_g[i[0]].seq[max(int(i[2])-1, 0)])+i[13] if i[1] == 'INS' else str(ref_g[i[0]].seq[max(int(i[2])-1, 0)])),
- INFO = info_list,
- FORMAT = "GT:DR:DV:PL:GQ",
- GT = i[8],
- DR = i[7],
- RE = i[4],
- PL = i[9],
- GQ = i[10],
- QUAL = i[11],
- PASS = filter_lable))
- svid[i[1]] += 1
- elif i[1] == "DUP":
- if abs(int(float(i[3]))) > args.max_size and args.max_size != -1:
- continue
- cal_end = int(i[2]) + 1 + abs(int(float(i[3])))
- info_list = "{PRECISION};SVTYPE={SVTYPE};SVLEN={SVLEN};END={END};RE={RE};STRAND=-+;RNAMES={RNAMES}".format(
- PRECISION = "IMPRECISE" if i[6] == "0/0" else "PRECISE",
- SVTYPE = i[1],
- SVLEN = i[3],
- END = str(cal_end),
- RE = i[4],
- RNAMES = i[10] if args.report_readid else "NULL")
- if action:
- try:
- info_list += ";AF=" + str(round(int(i[4]) / (int(i[4]) + int(i[5])), 4))
- except:
- info_list += ";AF=."
- if i[9] == ".":
- filter_lable = "PASS"
- else:
- filter_lable = "PASS" if float(i[9]) >= 5.0 else "q5"
- file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
- CHR = i[0],
- POS = str(int(i[2]) + 1),
- ID = "cuteSV.%s.%d"%(i[1], svid[i[1]]),
- REF = str(ref_g[i[0]].seq[int(i[2])]),
- ALT = "<%s>"%(i[1]),
- INFO = info_list,
- FORMAT = "GT:DR:DV:PL:GQ",
- GT = i[6],
- DR = i[5],
- RE = i[4],
- PL = i[7],
- GQ = i[8],
- QUAL = i[9],
- PASS = filter_lable))
- svid[i[1]] += 1
- elif i[1] == "INV":
- if abs(int(float(i[3]))) > args.max_size and args.max_size != -1:
- continue
- cal_end = int(i[2]) + 1 + abs(int(float(i[3])))
- info_list = "{PRECISION};SVTYPE={SVTYPE};SVLEN={SVLEN};END={END};RE={RE};STRAND={STRAND};RNAMES={RNAMES}".format(
- PRECISION = "IMPRECISE" if i[6] == "0/0" else "PRECISE",
- SVTYPE = i[1],
- SVLEN = i[3],
- END = str(cal_end),
- RE = i[4],
- STRAND = i[7],
- RNAMES = i[11] if args.report_readid else "NULL")
- if action:
- try:
- info_list += ";AF=" + str(round(int(i[4]) / (int(i[4]) + int(i[5])), 4))
- except:
- info_list += ";AF=."
- if i[10] == ".":
- filter_lable = "PASS"
- else:
- filter_lable = "PASS" if float(i[10]) >= 5.0 else "q5"
- file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
- CHR = i[0],
- POS = str(int(i[2]) + 1),
- ID = "cuteSV.%s.%d"%(i[1], svid[i[1]]),
- REF = str(ref_g[i[0]].seq[int(i[2])]),
- ALT = "<%s>"%(i[1]),
- INFO = info_list,
- FORMAT = "GT:DR:DV:PL:GQ",
- GT = i[6],
- DR = i[5],
- RE = i[4],
- PL = i[8],
- GQ = i[9],
- QUAL = i[10],
- PASS = filter_lable))
- svid[i[1]] += 1
- else:
- # BND
- # info_list = "{PRECISION};SVTYPE={SVTYPE};CHR2={CHR2};END={END};RE={RE};RNAMES={RNAMES}".format(
- info_list = "{PRECISION};SVTYPE={SVTYPE};RE={RE};RNAMES={RNAMES}".format(
- PRECISION = "IMPRECISE" if i[7] == "0/0" else "PRECISE",
- SVTYPE = "BND",
- # CHR2 = i[3],
- # END = str(int(i[4]) + 1),
- RE = i[5],
- RNAMES = i[11] if args.report_readid else "NULL")
- if action:
- try:
- info_list += ";AF=" + str(round(int(i[5]) / (int(i[5]) + int(i[6])), 4))
- except:
- info_list += ";AF=."
- if i[10] == ".":
- filter_lable = "PASS"
- else:
- filter_lable = "PASS" if float(i[10]) >= 5.0 else "q5"
- file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
- CHR = i[0],
- POS = str(int(i[2]) + 1),
- ID = "cuteSV.%s.%d"%("BND", svid["BND"]),
- REF = 'N',
- ALT = i[1],
- INFO = info_list,
- FORMAT = "GT:DR:DV:PL:GQ",
- GT = i[7],
- DR = i[6],
- RE = i[5],
- PL = i[8],
- GQ = i[9],
- QUAL = i[10],
- PASS = filter_lable))
- svid["BND"] += 1
+ file = open(args.output, 'w')
+ action = args.genotype
+ Generation_VCF_header(file, contigINFO, args.sample, argv)
+ file.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n"%(args.sample))
+ for i in semi_result:
+ if i[1] in ["DEL", "INS"]:
+ if abs(int(float(i[3]))) > args.max_size and args.max_size != -1:
+ continue
+ if abs(int(float(i[3]))) < args.min_size:
+ continue
+ if i[1] == "INS":
+ cal_end = int(i[2])
+ else:
+ cal_end = int(i[2]) + abs(int(float(i[3])))
+ info_list = "{PRECISION};SVTYPE={SVTYPE};SVLEN={SVLEN};END={END};CIPOS={CIPOS};CILEN={CILEN};RE={RE};RNAMES={RNAMES}".format(
+ PRECISION = "IMPRECISE" if i[8] == "0/0" else "PRECISE",
+ SVTYPE = i[1],
+ SVLEN = i[3],
+ END = str(cal_end),
+ CIPOS = i[5],
+ CILEN = i[6],
+ RE = i[4],
+ RNAMES = i[12] if args.report_readid else "NULL")
+ if action:
+ try:
+ info_list += ";AF=" + str(round(int(i[4]) / (int(i[4]) + int(i[7])), 4))
+ except:
+ info_list += ";AF=."
+ if i[1] =="DEL":
+ info_list += ";STRAND=+-"
+ if i[11] == "." or i[11] == None:
+ filter_lable = "PASS"
+ else:
+ filter_lable = "PASS" if float(i[11]) >= 5.0 else "q5"
+ file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
+ CHR = i[0],
+ POS = str(int(i[2])),
+ ID = "cuteSV.%s.%d"%(i[1], svid[i[1]]),
+ REF = str(ref_g[i[0]].seq[max(int(i[2])-1, 0)]) if i[1] == 'INS' else str(ref_g[i[0]].seq[max(int(i[2])-1, 0):int(i[2])-int(i[3])]),
+ ALT = "%s"%(str(ref_g[i[0]].seq[max(int(i[2])-1, 0)])+i[13] if i[1] == 'INS' else str(ref_g[i[0]].seq[max(int(i[2])-1, 0)])),
+ INFO = info_list,
+ FORMAT = "GT:DR:DV:PL:GQ",
+ GT = i[8],
+ DR = i[7],
+ RE = i[4],
+ PL = i[9],
+ GQ = i[10],
+ QUAL = i[11],
+ PASS = filter_lable))
+ svid[i[1]] += 1
+ elif i[1] == "DUP":
+ if abs(int(float(i[3]))) > args.max_size and args.max_size != -1:
+ continue
+ cal_end = int(i[2]) + 1 + abs(int(float(i[3])))
+ info_list = "{PRECISION};SVTYPE={SVTYPE};SVLEN={SVLEN};END={END};RE={RE};STRAND=-+;RNAMES={RNAMES}".format(
+ PRECISION = "IMPRECISE" if i[6] == "0/0" else "PRECISE",
+ SVTYPE = i[1],
+ SVLEN = i[3],
+ END = str(cal_end),
+ RE = i[4],
+ RNAMES = i[10] if args.report_readid else "NULL")
+ if action:
+ try:
+ info_list += ";AF=" + str(round(int(i[4]) / (int(i[4]) + int(i[5])), 4))
+ except:
+ info_list += ";AF=."
+ if i[9] == ".":
+ filter_lable = "PASS"
+ else:
+ filter_lable = "PASS" if float(i[9]) >= 5.0 else "q5"
+ file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
+ CHR = i[0],
+ POS = str(int(i[2]) + 1),
+ ID = "cuteSV.%s.%d"%(i[1], svid[i[1]]),
+ REF = str(ref_g[i[0]].seq[int(i[2])]),
+ ALT = "<%s>"%(i[1]),
+ INFO = info_list,
+ FORMAT = "GT:DR:DV:PL:GQ",
+ GT = i[6],
+ DR = i[5],
+ RE = i[4],
+ PL = i[7],
+ GQ = i[8],
+ QUAL = i[9],
+ PASS = filter_lable))
+ svid[i[1]] += 1
+ elif i[1] == "INV":
+ if abs(int(float(i[3]))) > args.max_size and args.max_size != -1:
+ continue
+ cal_end = int(i[2]) + 1 + abs(int(float(i[3])))
+ info_list = "{PRECISION};SVTYPE={SVTYPE};SVLEN={SVLEN};END={END};RE={RE};STRAND={STRAND};RNAMES={RNAMES}".format(
+ PRECISION = "IMPRECISE" if i[6] == "0/0" else "PRECISE",
+ SVTYPE = i[1],
+ SVLEN = i[3],
+ END = str(cal_end),
+ RE = i[4],
+ STRAND = i[7],
+ RNAMES = i[11] if args.report_readid else "NULL")
+ if action:
+ try:
+ info_list += ";AF=" + str(round(int(i[4]) / (int(i[4]) + int(i[5])), 4))
+ except:
+ info_list += ";AF=."
+ if i[10] == ".":
+ filter_lable = "PASS"
+ else:
+ filter_lable = "PASS" if float(i[10]) >= 5.0 else "q5"
+ file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
+ CHR = i[0],
+ POS = str(int(i[2]) + 1),
+ ID = "cuteSV.%s.%d"%(i[1], svid[i[1]]),
+ REF = str(ref_g[i[0]].seq[int(i[2])]),
+ ALT = "<%s>"%(i[1]),
+ INFO = info_list,
+ FORMAT = "GT:DR:DV:PL:GQ",
+ GT = i[6],
+ DR = i[5],
+ RE = i[4],
+ PL = i[8],
+ GQ = i[9],
+ QUAL = i[10],
+ PASS = filter_lable))
+ svid[i[1]] += 1
+ else:
+ # BND
+ # info_list = "{PRECISION};SVTYPE={SVTYPE};CHR2={CHR2};END={END};RE={RE};RNAMES={RNAMES}".format(
+ info_list = "{PRECISION};SVTYPE={SVTYPE};RE={RE};RNAMES={RNAMES}".format(
+ PRECISION = "IMPRECISE" if i[7] == "0/0" else "PRECISE",
+ SVTYPE = "BND",
+ # CHR2 = i[3],
+ # END = str(int(i[4]) + 1),
+ RE = i[5],
+ RNAMES = i[11] if args.report_readid else "NULL")
+ if action:
+ try:
+ info_list += ";AF=" + str(round(int(i[5]) / (int(i[5]) + int(i[6])), 4))
+ except:
+ info_list += ";AF=."
+ if i[10] == ".":
+ filter_lable = "PASS"
+ else:
+ filter_lable = "PASS" if float(i[10]) >= 5.0 else "q5"
+ file.write("{CHR}\t{POS}\t{ID}\t{REF}\t{ALT}\t{QUAL}\t{PASS}\t{INFO}\t{FORMAT}\t{GT}:{DR}:{RE}:{PL}:{GQ}\n".format(
+ CHR = i[0],
+ POS = str(int(i[2]) + 1),
+ ID = "cuteSV.%s.%d"%("BND", svid["BND"]),
+ REF = 'N',
+ ALT = i[1],
+ INFO = info_list,
+ FORMAT = "GT:DR:DV:PL:GQ",
+ GT = i[7],
+ DR = i[6],
+ RE = i[5],
+ PL = i[8],
+ GQ = i[9],
+ QUAL = i[10],
+ PASS = filter_lable))
+ svid["BND"] += 1
def generate_pvcf(args, result, contigINFO, argv, ref_g):
@@ -376,8 +378,8 @@ def generate_pvcf(args, result, contigINFO, argv, ref_g):
for i in result:
if i == []:
continue
- if i[7][5] == '.,.':
- print(i)
+ # if i[7][5] == '.,.':
+ # print(i)
if i[7][5] == "." or i[7][5] == None:
filter_lable = "PASS"
else:
@@ -558,37 +560,60 @@ def generate_pvcf(args, result, contigINFO, argv, ref_g):
GQ = i[7][4]
))
-
+
def load_valuable_chr(path):
- valuable_chr = dict()
- valuable_chr["DEL"] = list()
- valuable_chr["DUP"] = list()
- valuable_chr["INS"] = list()
- valuable_chr["INV"] = list()
- valuable_chr["TRA"] = dict()
+ valuable_chr = dict()
+ valuable_chr["DEL"] = list()
+ valuable_chr["DUP"] = list()
+ valuable_chr["INS"] = list()
+ valuable_chr["INV"] = list()
+ valuable_chr["TRA"] = dict()
- for svtype in ["DEL", "DUP", "INS", "INV"]:
- file = open("%s%s.sigs"%(path, svtype), 'r')
- for line in file:
- chr = line.strip('\n').split('\t')[1]
- if chr not in valuable_chr[svtype]:
- valuable_chr[svtype].append(chr)
- file.close()
- valuable_chr[svtype].sort()
+ for svtype in ["DEL", "DUP", "INS", "INV"]:
+ file = open("%s%s.sigs"%(path, svtype), 'r')
+ for line in file:
+ chr = line.strip('\n').split('\t')[1]
+ if chr not in valuable_chr[svtype]:
+ valuable_chr[svtype].append(chr)
+ file.close()
+ valuable_chr[svtype].sort()
- file = open("%s%s.sigs"%(path, "TRA"), 'r')
- for line in file:
- chr1 = line.strip('\n').split('\t')[1]
- chr2 = line.strip('\n').split('\t')[4]
-
- if chr1 not in valuable_chr["TRA"]:
- valuable_chr["TRA"][chr1] = list()
- if chr2 not in valuable_chr["TRA"][chr1]:
- valuable_chr["TRA"][chr1].append(chr2)
+ file = open("%s%s.sigs"%(path, "TRA"), 'r')
+ for line in file:
+ chr1 = line.strip('\n').split('\t')[1]
+ chr2 = line.strip('\n').split('\t')[4]
+
+ if chr1 not in valuable_chr["TRA"]:
+ valuable_chr["TRA"][chr1] = list()
+ if chr2 not in valuable_chr["TRA"][chr1]:
+ valuable_chr["TRA"][chr1].append(chr2)
- file.close()
- for chr1 in valuable_chr["TRA"]:
- valuable_chr["TRA"][chr1].sort()
+ file.close()
+ for chr1 in valuable_chr["TRA"]:
+ valuable_chr["TRA"][chr1].sort()
- return valuable_chr
+ return valuable_chr
+def load_bed(bed_file, Task_list):
+ # Task_list: [[chr, start, end], ...]
+ bed_regions = dict()
+ if bed_file != None:
+ # only consider regions in BED file
+ with open(bed_file, 'r') as f:
+ for line in f:
+ seq = line.strip().split('\t')
+ if seq[0] not in bed_regions:
+ bed_regions[seq[0]] = list()
+ bed_regions[seq[0]].append((int(seq[1]) - 1000, int(seq[2]) + 1000))
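+ # pad each region by 1000 bp on both sides so signatures near region edges are not lost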
+ region_list = [[] for i in range(len(Task_list))]
+ for chrom in bed_regions:
+ bed_regions[chrom].sort(key = lambda x:(x[0], x[1]))
+ for item in bed_regions[chrom]:
+ for i in range(len(Task_list)):
+ if chrom == Task_list[i][0]:
+ if (Task_list[i][1] <= item[0] and Task_list[i][2] > item[0]) or item[0] <= Task_list[i][1] < item[1]:
+ region_list[i].append(item)
+ assert len(region_list) == len(Task_list), "parse bed file error"
+ return region_list
+ else:
+ return None
\ No newline at end of file
=====================================
src/cuteSV/cuteSV_resolveDUP.py
=====================================
@@ -29,7 +29,8 @@ def resolution_DUP(path, chr, read_count, max_cluster_bias, sv_size,
pos_2 = int(seq[3])
read_id = seq[4]
- if pos_1 - semi_dup_cluster[-1][0] > max_cluster_bias or pos_2 - semi_dup_cluster[-1][1] > max_cluster_bias:
+ # if pos_1 - semi_dup_cluster[-1][0] > max_cluster_bias or pos_2 - semi_dup_cluster[-1][1] > max_cluster_bias:
+ if pos_1 - semi_dup_cluster[-1][0] > max_cluster_bias:
if len(semi_dup_cluster) >= read_count:
if semi_dup_cluster[-1][0] == semi_dup_cluster[-1][1] == 0:
pass
@@ -81,39 +82,52 @@ def generate_dup_cluster(semi_dup_cluster, chr, read_count, max_cluster_bias,
if len(support_read) < read_count:
return
- low_b = int(len(semi_dup_cluster)*0.4)
- up_b = int(len(semi_dup_cluster)*0.6)
+ semi_dup_cluster.sort(key = lambda x:x[1])
+ allele_collect = []
+ allele_collect.append([semi_dup_cluster[0]])
+ last_len = semi_dup_cluster[0][1]
+ for i in semi_dup_cluster[1:]:
+ if i[1] - last_len > max_cluster_bias:
+ allele_collect.append([])
+ allele_collect[-1].append(i)
+ last_len = i[1]
+ for i in allele_collect:
+ support_read = list(set([j[2] for j in i]))
+ if len(support_read) < read_count:
+ continue
+ low_b = int(len(i)*0.4)
+ up_b = int(len(i)*0.6)
- if low_b == up_b:
- breakpoint_1 = semi_dup_cluster[low_b][0]
- breakpoint_2 = semi_dup_cluster[low_b][1]
- else:
- breakpoint_1 = [i[0] for i in semi_dup_cluster[low_b:up_b]]
- breakpoint_2 = [i[1] for i in semi_dup_cluster[low_b:up_b]]
- breakpoint_1 = int(sum(breakpoint_1)/len(semi_dup_cluster[low_b:up_b]))
- breakpoint_2 = int(sum(breakpoint_2)/len(semi_dup_cluster[low_b:up_b]))
+ if low_b == up_b:
+ breakpoint_1 = i[low_b][0]
+ breakpoint_2 = i[low_b][1]
+ else:
+ breakpoint_1 = [i[0] for i in i[low_b:up_b]]
+ breakpoint_2 = [i[1] for i in i[low_b:up_b]]
+ breakpoint_1 = int(sum(breakpoint_1)/len(i[low_b:up_b]))
+ breakpoint_2 = int(sum(breakpoint_2)/len(i[low_b:up_b]))
- if sv_size <= breakpoint_2 - breakpoint_1 <= MaxSize or (sv_size <= breakpoint_2 - breakpoint_1 and MaxSize == -1):
- if action:
- candidate_single_SV.append([chr,
- 'DUP',
- breakpoint_1,
- breakpoint_2,
- support_read])
- # print("DUP", chr, int(breakpoint_1), int(breakpoint_2), DR, DV, QUAL, "%.4f"%cost_time)
- else:
- candidate_single_SV.append([chr,
- 'DUP',
- str(breakpoint_1),
- str(breakpoint_2 - breakpoint_1),
- str(len(support_read)),
- '.',
- './.',
- '.,.,.',
- '.',
- '.',
- str(','.join(support_read))])
+ if sv_size <= breakpoint_2 - breakpoint_1 <= MaxSize or (sv_size <= breakpoint_2 - breakpoint_1 and MaxSize == -1):
+ if action:
+ candidate_single_SV.append([chr,
+ 'DUP',
+ breakpoint_1,
+ breakpoint_2,
+ support_read])
+ # print("DUP", chr, int(breakpoint_1), int(breakpoint_2), DR, DV, QUAL, "%.4f"%cost_time)
+ else:
+ candidate_single_SV.append([chr,
+ 'DUP',
+ str(breakpoint_1),
+ str(breakpoint_2 - breakpoint_1),
+ str(len(support_read)),
+ '.',
+ './.',
+ '.,.,.',
+ '.',
+ '.',
+ str(','.join(support_read))])
def run_dup(args):
=====================================
src/documentation/README.md
=====================================
@@ -0,0 +1,162 @@
+# Force Calling Benchmark
+
+We provide a demo to help users perform regenotyping and evaluate its performance. Here, we use the integrated VCF file built from the callsets of HG002, HG003, and HG004 (all from the GIAB Ashkenazim Trio) as the cohort-level SV target; the HG002 alignments are then used to regenotype this SV target.
+The following procedures show how to run the regenotyping and reproduce the benchmark results.
+
+# Get tools
+
+Information about how to install `conda` and add the `bioconda` channel is available on https://bioconda.github.io/.
+
+```sh
+conda create -n sniffles1_env python=3
+conda activate sniffles1_env
+conda install sniffles==1.0.12
+```
+```sh
+conda create -n test_fc python=3
+conda activate test_fc
+conda install sniffles==2.0.2 cuteSV==2.0.2 svjedi==1.1.6 truvari==3.2.0 samtools tabix
+# Installing cuteSV2 takes approximately 30 seconds.
+```
+
+# Get data
+1) Create directory structure:
+```sh
+conda activate test_fc
+mkdir -p ref alns tools/{sniffles1,sniffles2,cutesv,svjedi} giab
+```
+
+2) Download NIST and CMRG ground truth:
+```sh
+FTPDIR=ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/analysis
+curl -s ${FTPDIR}/NIST_SVs_Integration_v0.6/HG002_SVs_Tier1_v0.6.bed > giab/HG002_SVs_Tier1_v0.6.bed
+curl -s ${FTPDIR}/NIST_SVs_Integration_v0.6/HG002_SVs_Tier1_v0.6.vcf.gz > giab/HG002_SVs_Tier1_v0.6.vcf.gz
+```
+```sh
+FTPDIR=ftp://ftp.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio/HG002_NA24385_son/CMRG_v1.00/GRCh37/StructuralVariant
+curl -s ${FTPDIR}/HG002_GRCh37_CMRG_SV_v1.00.bed > giab/HG002_GRCh37_CMRG_SV_v1.00.bed
+curl -s ${FTPDIR}/HG002_GRCh37_CMRG_SV_v1.00.vcf.gz > giab/HG002_GRCh37_CMRG_SV_v1.00.vcf.gz
+```
+
+3) Download hg19 reference with decoys and map non-ACGT characters to N:
+```sh
+curl -s ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz > ref/human_hs37d5.fasta.gz
+gunzip ref/human_hs37d5.fasta.gz
+sed -i '/^[^>]/ y/BDEFHIJKLMNOPQRSUVWXYZbdefhijklmnopqrsuvwxyz/NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN/' ref/human_hs37d5.fasta
+```
+
+4) Download all `.bam` files:
+```sh
+curl -s ftp://trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/PacBio_CCS_15kb/alignment/HG002.Sequel.15kb.pbmm2.hs37d5.whatshap.haplotag.RTG.10x.trio.bam > alns/HG002_origin.bam
+samtools calmd -b alns/HG002_origin.bam ref/human_hs37d5.fasta > alns/HG002_all.bam
+samtools index alns/HG002_all.bam
+```
+
+5) Download the trio vcf file:
+```sh
+curl -s https://zenodo.org/record/7347467/files/trio.vcf > trio.vcf
+```
+
+# Run sniffles1
+
+6a) Run sniffles1 (v1.0.12):
+```sh
+conda activate sniffles1_env
+sniffles -m alns/HG002_all.bam -v tools/sniffles1/sniffles1.call.vcf --Ivcf trio.vcf
+conda deactivate
+```
+6b) Prepare for truvari:
+```sh
+grep '#' tools/sniffles1/sniffles1.call.vcf > tools/sniffles1/sniffles1.sort.vcf
+grep -v '#' tools/sniffles1/sniffles1.call.vcf | sort -k 1,1 -k 2,2n >> tools/sniffles1/sniffles1.sort.vcf
+grep '#' tools/sniffles1/sniffles1.sort.vcf > tools/sniffles1/sniffles1.vcf
+grep -v '#' tools/sniffles1/sniffles1.sort.vcf | grep -v '0/0' | grep -v "\./\." >> tools/sniffles1/sniffles1.vcf
+bgzip -c tools/sniffles1/sniffles1.vcf > tools/sniffles1/sniffles1.vcf.gz
+tabix tools/sniffles1/sniffles1.vcf.gz
+```
+
+# Run sniffles2
+
+7a) Run sniffles2 (v2.0.2):
+```sh
+sniffles --input alns/HG002_all.bam --vcf tools/sniffles2/sniffles2.call.vcf --genotype-vcf trio.vcf
+```
+7b) Prepare for truvari:
+```sh
+grep '#' tools/sniffles2/sniffles2.call.vcf > tools/sniffles2/sniffles2.sort.vcf
+grep -v '#' tools/sniffles2/sniffles2.call.vcf | sort -k 1,1 -k 2,2n >> tools/sniffles2/sniffles2.sort.vcf
+awk -F '\t' '{if($1=="#CHROM") {for(i=1;i<10;i++) printf($i"\t"); print($10);} else print($0);}' tools/sniffles2/sniffles2.sort.vcf > temp.vcf
+sed -i 'N;122 a ##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# High-quality variant reads">' temp.vcf
+sed -i 'N;122 a ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="# Genotype quality">' temp.vcf
+grep '#' temp.vcf > tools/sniffles2/sniffles2.vcf
+grep -v '#' temp.vcf | grep -v '0/0' | grep -v "\./\." >> tools/sniffles2/sniffles2.vcf
+rm temp.vcf
+bgzip -c tools/sniffles2/sniffles2.vcf > tools/sniffles2/sniffles2.vcf.gz
+tabix tools/sniffles2/sniffles2.vcf.gz
+```
+
+# Run cuteSV2
+
+8a) Run cuteSV2 (v2.0.2):
+```sh
+cuteSV alns/HG002_all.bam ref/human_hs37d5.fasta tools/cutesv/cutesv.call.vcf ./ --max_cluster_bias_INS 1000 --diff_ratio_merging_INS 0.5 --max_cluster_bias_DEL 1000 --diff_ratio_merging_DEL 0.5 -Ivcf trio.vcf -q 10
+```
+8b) Prepare for truvari:
+```sh
+grep '#' tools/cutesv/cutesv.call.vcf > tools/cutesv/cutesv.vcf
+grep -v '#' tools/cutesv/cutesv.call.vcf | grep -v '0/0' | grep -v "\./\." >> tools/cutesv/cutesv.vcf
+bgzip -c tools/cutesv/cutesv.vcf > tools/cutesv/cutesv.vcf.gz
+tabix tools/cutesv/cutesv.vcf.gz
+# The sample output of cuteSV2 is available at https://doi.org/10.5281/zenodo.7347467.
+```
+
+# Run SVJedi
+
+9a) Run SVJedi (v1.1.6):
+```sh
+samtools fasta alns/HG002_all.bam > alns/HG002_all.fasta
+python3 svjedi.py -v trio.vcf -r ref/human_hs37d5.fasta -i alns/HG002_all.fasta -o tools/svjedi/svjedi.call.vcf
+```
+9b) Prepare for truvari:
+```sh
+grep '#' tools/svjedi/svjedi.call.vcf > tools/svjedi/svjedi.sort.vcf
+grep -v '#' tools/svjedi/svjedi.call.vcf | sort -k 1,1 -k 2,2n >> tools/svjedi/svjedi.sort.vcf
+grep '#' tools/svjedi/svjedi.sort.vcf > tools/svjedi/svjedi.vcf
+grep -v '#' tools/svjedi/svjedi.sort.vcf | grep -v '0/0' | grep -v "\./\." >> tools/svjedi/svjedi.vcf
+bgzip -c tools/svjedi/svjedi.vcf > tools/svjedi/svjedi.vcf.gz
+tabix tools/svjedi/svjedi.vcf.gz
+```
+
+# Final comparison
+
+10a) Compare to the NIST ground truth (truvari v3.2.0):
+```sh
+truvari bench -b giab/HG002_SVs_Tier1_v0.6.vcf.gz -c tools/sniffles1/sniffles1.vcf.gz\
+ --includebed giab/HG002_SVs_Tier1_v0.6.bed -o NIST-sniffles1 -p 0 -r 1000 --multimatch --passonly
+truvari bench -b giab/HG002_SVs_Tier1_v0.6.vcf.gz -c tools/sniffles2/sniffles2.vcf.gz\
+ --includebed giab/HG002_SVs_Tier1_v0.6.bed -o NIST-sniffles2 -p 0 -r 1000 --multimatch --passonly
+truvari bench -b giab/HG002_SVs_Tier1_v0.6.vcf.gz -c tools/cutesv/cutesv.vcf.gz\
+ --includebed giab/HG002_SVs_Tier1_v0.6.bed -o NIST-cutesv -p 0 -r 1000 --multimatch --passonly
+truvari bench -b giab/HG002_SVs_Tier1_v0.6.vcf.gz -c tools/svjedi/svjedi.vcf.gz\
+ --includebed giab/HG002_SVs_Tier1_v0.6.bed -o NIST-svjedi -p 0 -r 1000 --multimatch --passonly
+```
+10b) Compare to the CMRG ground truth (truvari v3.2.0):
+```sh
+truvari bench -b giab/HG002_GRCh37_CMRG_SV_v1.00.vcf.gz -c tools/sniffles1/sniffles1.vcf.gz\
+ --includebed giab/HG002_GRCh37_CMRG_SV_v1.00.bed -o CMRG-sniffles1 -p 0 -r 1000 --multimatch --passonly
+truvari bench -b giab/HG002_GRCh37_CMRG_SV_v1.00.vcf.gz -c tools/sniffles2/sniffles2.vcf.gz\
+ --includebed giab/HG002_GRCh37_CMRG_SV_v1.00.bed -o CMRG-sniffles2 -p 0 -r 1000 --multimatch --passonly
+truvari bench -b giab/HG002_GRCh37_CMRG_SV_v1.00.vcf.gz -c tools/cutesv/cutesv.vcf.gz\
+ --includebed giab/HG002_GRCh37_CMRG_SV_v1.00.bed -o CMRG-cutesv -p 0 -r 1000 --multimatch --passonly
+truvari bench -b giab/HG002_GRCh37_CMRG_SV_v1.00.vcf.gz -c tools/svjedi/svjedi.vcf.gz\
+ --includebed giab/HG002_GRCh37_CMRG_SV_v1.00.bed -o CMRG-svjedi -p 0 -r 1000 --multimatch --passonly
+```
+
+# Downsampling
+
+11) Downsample the original alignment file:
+```sh
+samtools view -h -s 0.66 alns/HG002_all.bam | samtools view -bS > alns/HG002_20x.bam
+samtools view -h -s 0.33 alns/HG002_all.bam | samtools view -bS > alns/HG002_10x.bam
+samtools view -h -s 0.17 alns/HG002_all.bam | samtools view -bS > alns/HG002_5x.bam
+```
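+
+The downsampled BAMs need indexes before they can replace `alns/HG002_all.bam` in the calling steps above; assuming the same layout:
+```sh
+samtools index alns/HG002_20x.bam
+samtools index alns/HG002_10x.bam
+samtools index alns/HG002_5x.bam
+```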
View it on GitLab: https://salsa.debian.org/med-team/cutesv/-/commit/42a9fad2ae09c7116358a2da6709c8e93f676216