[med-svn] [Git][med-team/tiddit][upstream] New upstream version 3.3.0+dfsg

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Fri Aug 19 13:45:53 BST 2022



Nilesh Patra pushed to branch upstream at Debian Med / tiddit


Commits:
91690294 by Nilesh Patra at 2022-08-19T18:01:55+05:30
New upstream version 3.3.0+dfsg
- - - - -


8 changed files:

- README.md
- setup.py
- tiddit/__main__.py
- tiddit/tiddit_contig_analysis.pyx
- tiddit/tiddit_coverage.pyx
- tiddit/tiddit_coverage_analysis.pyx
- tiddit/tiddit_signal.pyx
- tiddit/tiddit_variant.pyx


Changes:

=====================================
README.md
=====================================
@@ -8,7 +8,8 @@ On a 30X human genome, the TIDDIT SV module typically completes within 5 hours
 
 INSTALLATION
 ==============
-TIDDIT requires python3, cython, pysam, and Numpy.
+TIDDIT requires python3 (>= 3.8), cython, pysam, and Numpy.
+
 
 By default, tiddit requires bwa, fermi2 and ropebwt2 for local assembly; local assembly may be disabled through the "--skip_assembly" parameter.
 
@@ -74,6 +75,7 @@ TIDDIT may be fine-tuned by altering these optional parameters:
 	-d	expected reads orientations, possible values "innie" (-> <-) or "outtie" (<- ->). Default: major orientation within the dataset
 	-p	Minimum number of supporting pairs in order to call a variant (default 3)
 	-r	Minimum number of supporting split reads to call a variant (default 3)
+	--threads	Number of threads (default 1)
 	-q	Minimum mapping quality to consider an alignment (default 5)
 	-n	the ploidy of the organism,(default = 2)
 	-e	clustering distance parameter, discordant pairs closer than this distance are considered to belong to the same variant(default = sqrt(insert-size*2)*12)
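
As a usage sketch, the new --threads option combines with the existing ones like so (the --bam, --ref and -o flags are the ones referenced in tiddit/__main__.py below; all file names are placeholders):

	tiddit --sv --bam sample.bam --ref reference.fasta -o sample_out --threads 4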


=====================================
setup.py
=====================================
@@ -20,8 +20,9 @@ else:
 
 setup(
     name = 'tiddit',
-    version = '3.1.0',
-    url = "https://github.com/J35P312/SVDB",
+    version = '3.3.0',
+
+    url = "https://github.com/SciLifeLab/TIDDIT",
     author = "Jesper Eisfeldt",
     author_email= "jesper.eisfeldt at scilifelab.se",
     ext_modules = ext_modules,


=====================================
tiddit/__main__.py
=====================================
@@ -5,6 +5,7 @@ import time
 import pysam
 import os
 import shutil
+import glob
 
 import tiddit.tiddit_stats as tiddit_stats
 import tiddit.tiddit_signal as tiddit_signal
@@ -16,7 +17,7 @@ import tiddit.tiddit_variant as tiddit_variant
 import tiddit.tiddit_contig_analysis as tiddit_contig_analysis
 
 def main():
-	version="3.1.0"
+	version="3.3.0"
 	parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
 	parser.add_argument("--sv"	 , help="call structural variation", required=False, action="store_true")
 	parser.add_argument("--cov"        , help="generate a coverage bed file", required=False, action="store_true")
@@ -31,6 +32,7 @@ def main():
 		parser.add_argument('-i', type=int, help="paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)")
 		parser.add_argument('-d', type=str,help="expected reads orientations, possible values \"innie\" (-> <-) or \"outtie\" (<- ->). Default: major orientation within the dataset")
 		parser.add_argument('-p', type=int,default=3, help="Minimum number of supporting pairs in order to call a variant (default 3)")
+		parser.add_argument('--threads', type=int,default=1, help="Number of threads (default=1)")
 		parser.add_argument('-r', type=int,default=3, help="Minimum number of supporting split reads to call a variant (default 3)")
 		parser.add_argument('-q', type=int,default=5, help="Minimum mapping quality to consider an alignment (default 5)")
 		parser.add_argument('-n', type=int,default=2, help="the ploidy of the organism,(default = 2)")
@@ -69,8 +71,8 @@ def main():
 				print("error, ropebwt2 executable missing, add ropebwt2 to path, or specify using --ropebwt2")
 				quit()
 
-			if not os.path.isfile(args.ref+".bwt") and not os.path.isfile(args.ref+".64.bwt"):
-				print ("error, The reference must be indexed using bwa index")
+			if not glob.glob("{}*.bwt*".format(args.ref)):
+				print ("error, The reference must be indexed using bwa index; run bwa index, or skip local assembly (--skip_assembly)")
 				quit()
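
The glob-based test above accepts either bwa index layout, i.e. a classic index (reference.fasta.bwt) as well as a 64-bit one (reference.fasta.64.bwt). A minimal sketch of the check in isolation, with illustrative paths:

	import glob

	ref = "reference.fasta"
	# matches reference.fasta.bwt as well as reference.fasta.64.bwt
	if not glob.glob("{}*.bwt*".format(ref)):
	    print("reference is not indexed; run: bwa index " + ref)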
 
 
@@ -89,6 +91,7 @@ def main():
 	
 		bam_file_name=args.bam
 		samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=args.ref)
+
 		bam_header=samfile.header
 		samfile.close()
 
@@ -109,7 +112,7 @@ def main():
 			contigs.append(contig["SN"])
 			contig_number[contig["SN"]]=i
 			contig_length[ contig["SN"] ]=contig["LN"]
-			i+=0
+			i+=1
 
 		prefix=args.o
 		try:
@@ -117,6 +120,8 @@ def main():
 			os.mkdir("{}_tiddit/clips".format(prefix) )
 		except:
 			print("Folder already exists")
+
+		pysam.index("-c","-m","6","-@",str(args.threads),bam_file_name,"{}_tiddit/{}.csi".format(args.o,sample_id))
 	
 		min_mapq=args.q
 		max_ins_len=100000
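
The pysam.index call forwards its arguments to samtools index: -c requests a CSI index (which, unlike BAI, supports contigs longer than 2^29 bp), -m 6 sets the CSI minimum interval size to 2^6 bp, and -@ sets the indexing threads. The parallel workers introduced below then open the BAM against this explicit index; roughly, with illustrative paths:

	import pysam

	pysam.index("-c", "-m", "6", "-@", "4", "sample.bam", "sample_tiddit/sample.csi")
	samfile = pysam.AlignmentFile("sample.bam", "r",
	                              reference_filename="reference.fasta",
	                              index_filename="sample_tiddit/sample.csi")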
@@ -130,7 +135,7 @@ def main():
 
 
 		t=time.time()
-		coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id)
+		coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig)
 		print("extracted signals in:")
 		print(t-time.time())
 
@@ -162,7 +167,6 @@ def main():
 		f.write(vcf_header+"\n")
 		
 		t=time.time()
-		#print(sv_clusters)
 		variants=tiddit_variant.main(bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len)
 		print("analyzed clusters in")
 		print(time.time()-t)
@@ -202,7 +206,12 @@ def main():
 			t=time.time()
 			if read.mapq >= args.q:
 				n_reads+=1
-				coverage_data[read.reference_name]=tiddit_coverage.update_coverage(read,args.z,coverage_data[read.reference_name],args.q,end_bin_size[read.reference_name])
+
+				read_position=read.reference_start
+				read_end=read.reference_end
+				read_reference_name=read.reference_name
+
+				coverage_data[read_reference_name]=tiddit_coverage.update_coverage(read_position,read_end,args.z,coverage_data[read_reference_name],end_bin_size[read_reference_name])
 
 		if args.w:
 			tiddit_coverage.print_coverage(coverage_data,bam_header,args.z,"wig",args.o +".wig")


=====================================
tiddit/tiddit_contig_analysis.pyx
=====================================
@@ -28,7 +28,14 @@ def read_contigs(aligned_contigs,prefix,sample_id,min_size):
 			continue
 
 		if read.has_tag("SA") and not (read.is_supplementary or read.is_secondary):
-			split_contigs=tiddit_signal.SA_analysis(read,-2,split_contigs,"SA")
+			split=tiddit_signal.SA_analysis(read,-2,"SA",read.reference_name)
+
+			if split:
+				if not split[2] in split_contigs[split[0]][split[1]]:
+					split_contigs[split[0]][split[1]][split[2]]=[]
+				split_contigs[split[0]][split[1]][split[2]]+=split[3:]
+
+			
 		elif read.has_tag("XA") and not (read.is_supplementary or read.is_secondary):
 			XA=read.get_tag("XA")
 			if XA.count(";") == 1:
@@ -44,7 +51,12 @@ def read_contigs(aligned_contigs,prefix,sample_id,min_size):
 					XA=",".join(xa_list)
 
 				read.set_tag("XA",XA)
-				split_contigs=tiddit_signal.SA_analysis(read,-2,split_contigs,"XA")
+				split=tiddit_signal.SA_analysis(read,-2,"XA",read.reference_name)
+
+				if split:
+					if not split[2] in split_contigs[split[0]][split[1]]:
+						split_contigs[split[0]][split[1]][split[2]]=[]
+					split_contigs[split[0]][split[1]][split[2]]+=split[3:]
 
 		elif not (read.is_supplementary or read.is_secondary) and len(read.cigartuples) > 2:
 
@@ -114,9 +126,9 @@ def main(prefix,sample_id,library,contigs,coverage_data,args):
 	f.close()
 	del clips
 
-	os.system("{} -dNCr {}_tiddit/clips.fa | {} assemble -l 81 - > {}_tiddit/clips.fa.assembly.mag".format(args.ropebwt2,prefix,args.fermi2,prefix))
+	os.system("{} -dNCr {}_tiddit/clips.fa | {} assemble -t {} -l 81 - > {}_tiddit/clips.fa.assembly.mag".format(args.ropebwt2,prefix,args.fermi2,args.threads,prefix))
 	os.system("{} simplify -COS -d 0.8 {}_tiddit/clips.fa.assembly.mag 1> {}_tiddit/clips.fa.assembly.clean.mag 2> /dev/null".format(args.fermi2,prefix,prefix))
-	os.system("{} mem -x intractg {} {}_tiddit/clips.fa.assembly.clean.mag  1> {}_tiddit/clips.sam 2> /dev/null".format(args.bwa,args.ref,prefix,prefix))
+	os.system("{} mem -t {} -x intractg {} {}_tiddit/clips.fa.assembly.clean.mag  1> {}_tiddit/clips.sam 2> /dev/null".format(args.bwa,args.threads,args.ref,prefix,prefix))
 
 	read_contigs("{}_tiddit/clips.sam".format(prefix) , prefix, sample_id, args.z)
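
In the first two hunks of this file, SA_analysis (reworked in tiddit_signal.pyx below) no longer mutates a shared dict: it now returns a flat record [chrA, chrB, read_name, pos, is_reverse, SA_pos, SA_is_reverse], or an empty tuple when no alignment passes the quality threshold, and each caller folds the record into its own nested chrA/chrB/read-name table. The merge step, factored out as a sketch:

	def add_split(split_contigs, split):
	    # split = [chrA, chrB, read_name, pos, is_reverse, SA_pos, SA_is_reverse]
	    if not split:
	        return
	    chrA, chrB, name = split[0], split[1], split[2]
	    if name not in split_contigs[chrA][chrB]:
	        split_contigs[chrA][chrB][name] = []
	    split_contigs[chrA][chrB][name] += split[3:]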
 	


=====================================
tiddit/tiddit_coverage.pyx
=====================================
@@ -1,4 +1,5 @@
 import sys
+import time
 cimport numpy
 import numpy
 import math
@@ -6,17 +7,18 @@ cimport cython
 @cython.boundscheck(False)
 @cython.wraparound(False)
 
-def create_coverage(bam_header,bin_size):
+def create_coverage(bam_header,bin_size,c="all"):
 	coverage_data={}
 	end_bin_size={}
 
 	for contig in bam_header["SQ"]:
-		bins= int(math.ceil(contig["LN"]/float(bin_size)))
-		coverage_data[ contig["SN"] ]=numpy.zeros(bins)
-		end_bin_size[contig["SN"]]=contig["LN"]-(bins-1)*bin_size
-
+		if c == "all" or contig["SN"] == c:
+			bins= int(math.ceil(contig["LN"]/float(bin_size)))
+			coverage_data[ contig["SN"] ]=numpy.zeros(bins)
+			end_bin_size[contig["SN"]]=contig["LN"]-(bins-1)*bin_size
+			if c != "all":
+				return(coverage_data[ contig["SN"] ],end_bin_size[contig["SN"]])
 	return(coverage_data,end_bin_size)
-
 def print_coverage(coverage_data,bam_header,bin_size,file_type,outfile):
 	f=open(outfile,"w",buffering=819200)
 
@@ -43,37 +45,34 @@ def print_coverage(coverage_data,bam_header,bin_size,file_type,outfile):
 	f.close()
 
 ctypedef numpy.double_t DTYPE_t
-def update_coverage(read,int bin_size,numpy.ndarray[DTYPE_t, ndim=1] coverage_data,int min_q,int end_bin_size):
-
-	cdef long ref_start=read.reference_start
-	cdef long ref_end=read.reference_end
+def update_coverage(long ref_start,long ref_end,int bin_size,numpy.ndarray[DTYPE_t, ndim=1] coverage_data,int end_bin_size):
 
 	cdef int first_bin=ref_start//bin_size
-	cdef int end_bin=int(ref_end-1)//bin_size
+	cdef int end_bin=(ref_end-1)//bin_size
 
-	cdef int bases_first_bin
+	cdef float bases_first_bin
 
 	if end_bin == first_bin:
 		bases_first_bin=ref_end-ref_start
-		coverage_data[first_bin]=float(bases_first_bin)/bin_size+coverage_data[first_bin]
+		coverage_data[first_bin]=bases_first_bin/bin_size+coverage_data[first_bin]
 
 		return(coverage_data)
 
 	bases_first_bin=((first_bin+1)*bin_size)-ref_start
-	coverage_data[first_bin]=float(bases_first_bin)/bin_size+coverage_data[first_bin]	
-	cdef int bases_last_bin=(ref_end-1)-end_bin*bin_size
+	coverage_data[first_bin]=bases_first_bin/bin_size+coverage_data[first_bin]
+	cdef float bases_last_bin=(ref_end-1)-end_bin*bin_size
+
 
 	if end_bin < len(coverage_data)-1:
-		coverage_data[end_bin]+=float(bases_last_bin)/bin_size
+		coverage_data[end_bin]=bases_last_bin/bin_size+coverage_data[end_bin]
 	else:
-		coverage_data[end_bin]+=float(bases_last_bin)/end_bin_size
+		coverage_data[end_bin]=bases_last_bin/end_bin_size+coverage_data[end_bin]
 
 	for i in range(first_bin+1,end_bin):
-		coverage_data[i]+=1.0
+		coverage_data[i]=1.0+coverage_data[i]
 
 	return(coverage_data)
 
-
 #bam_file_name=sys.argv[1]
 
 #samfile = pysam.AlignmentFile(bam_file_name, "r")
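
update_coverage spreads a read over fixed-size bins: the first and last bins receive the fraction of the bin the read actually covers, interior bins receive 1.0, and a contig's final bin (usually shorter than bin_size) is normalised by its true length, end_bin_size. A plain-Python rendering of the same arithmetic, minus the Cython typing:

	def update_coverage(ref_start, ref_end, bin_size, cov, end_bin_size):
	    first_bin = ref_start // bin_size
	    end_bin = (ref_end - 1) // bin_size
	    if end_bin == first_bin:
	        cov[first_bin] += (ref_end - ref_start) / bin_size
	        return cov
	    # partial coverage of the first and the last bin
	    cov[first_bin] += ((first_bin + 1) * bin_size - ref_start) / bin_size
	    bases_last = (ref_end - 1) - end_bin * bin_size
	    cov[end_bin] += bases_last / (bin_size if end_bin < len(cov) - 1 else end_bin_size)
	    # bins fully contained in the read
	    for i in range(first_bin + 1, end_bin):
	        cov[i] += 1.0
	    return cov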


=====================================
tiddit/tiddit_coverage_analysis.pyx
=====================================
@@ -75,6 +75,9 @@ def determine_ploidy(dict coverage_data,contigs,dict library,int ploidy,str pref
 		library["avg_coverage"]=c
 
 	for chromosome in contigs:
+		if not chromosome in coverage_data:
+			continue
+
 		avg_coverage_contig=library[ "avg_coverage_{}".format(chromosome) ]
 		library["contig_ploidy_{}".format(chromosome)]=int(round(ploidy*avg_coverage_contig/library["avg_coverage"]))
 		f.write("{}\t{}\t{}\t{}\n".format(chromosome,avg_coverage_contig/library["avg_coverage"]*ploidy,library["contig_ploidy_{}".format(chromosome)],avg_coverage_contig))
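
The added guard skips contigs for which no coverage profile exists (e.g. contigs excluded by the new min_contig filtering in tiddit_signal.pyx). For the remaining contigs the estimate is round(ploidy * contig_coverage / genome_coverage); a worked example:

	# with -n 2, a contig at half the genome-wide average is called as ploidy 1
	ploidy = 2
	avg_coverage = 30.0          # genome-wide average
	avg_coverage_contig = 15.0   # e.g. chrX in a male sample
	contig_ploidy = int(round(ploidy * avg_coverage_contig / avg_coverage))  # -> 1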


=====================================
tiddit/tiddit_signal.pyx
=====================================
@@ -3,6 +3,7 @@ import sys
 import os
 import itertools
 import time
+from joblib import Parallel, delayed
 
 import tiddit.tiddit_coverage as tiddit_coverage
 
@@ -26,7 +27,7 @@ def find_SA_query_range(SA):
 	a.cigar = tuple(SA_cigar)
 	return(a)
 
-def SA_analysis(read,min_q,splits,tag):
+def SA_analysis(read,min_q,tag,reference_name):
 	#print(read.query_alignment_start,read.query_alignment_end,read.is_reverse,read.cigarstring)
 	suplementary_alignments=read.get_tag(tag).rstrip(";").split(";")
 	
@@ -40,7 +41,6 @@ def SA_analysis(read,min_q,splits,tag):
 				supplementry_alignment=find_SA_query_range(SA_data)
 				SA_lengths.append(supplementry_alignment.query_alignment_end-supplementry_alignment.query_alignment_start)
 
-
 		longest_aln=0
 		for i in range(0,len(ok_q)):
 			if SA_lengths[i] > SA_lengths[longest_aln]:
@@ -48,7 +48,7 @@ def SA_analysis(read,min_q,splits,tag):
 
 		#all alignments fail the quality threshold
 		if len(ok_q) == 0:
-			return(splits)
+			return()
 
 		#only one SA passes the mapping quality threshold
 		elif len(ok_q) == 1:
@@ -61,9 +61,11 @@ def SA_analysis(read,min_q,splits,tag):
 	SA_pos=int(SA_data[1])
 
 	if int(SA_data[4]) < min_q:
-		return(splits)
+		return()
+
+	cdef long read_query_alignment_end=read.query_alignment_end
 
-	if (read.query_alignment_start ) < (read.query_length - read.query_alignment_end):
+	if (read.query_alignment_start ) < (read.query_length - read_query_alignment_end):
 		split_pos=read.reference_end+1
 	else:
 		split_pos=read.reference_start+1
@@ -72,21 +74,21 @@ def SA_analysis(read,min_q,splits,tag):
 	SA_chr=SA_data[0]
 
 
-	if (supplementry_alignment.query_alignment_start ) < (supplementry_alignment.query_length - read.query_alignment_end):
+	if (supplementry_alignment.query_alignment_start ) < (supplementry_alignment.query_length - read_query_alignment_end):
 		SA_split_pos=supplementry_alignment.reference_end
 	else:
 		SA_split_pos=supplementry_alignment.reference_start
 
 
-	if SA_chr < read.reference_name:
+	if SA_chr < reference_name:
 		chrA=SA_chr
-		chrB=read.reference_name
+		chrB=reference_name
 		tmp=split_pos
 		split_pos=SA_split_pos
 		SA_split_pos=tmp
 
 	else:
-		chrA=read.reference_name
+		chrA=reference_name
 		chrB=SA_chr
 
 		if chrA == chrB:
@@ -95,118 +97,150 @@ def SA_analysis(read,min_q,splits,tag):
 				split_pos=SA_split_pos
 				SA_split_pos=tmp
 
-	if not chrA in splits:
-		splits[chrA]={}
-
-	if not chrB in splits[chrA]:
-		splits[chrA][chrB]={}
-
-	if not read.query_name in splits[chrA][chrB]:
-		splits[chrA][chrB][read.query_name]=[]
-
+	split=[]
 	if "-" == SA_data[2]:
-		splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,True]
+		split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos, True]
 	else:
-		splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,False]
-
-	return(splits)
+		split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos,False]
+		#splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,False]
 
-def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id):
+	return(split)
 
-	samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
+def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size):
+	print("Collecting signals on contig: {}".format(chromosome))
+	samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))
 	bam_header=samfile.header
-	cdef int bin_size=50
-	cdef str file_type="wig"
-	cdef str outfile=prefix+".tiddit_coverage.wig"
+	coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,bin_size,chromosome)	
 
-	t_update=0
-	t_split=0
-	t_disc=0
-	t_tot=0
+	clips=[]
+	data=[]
+	splits=[]
 
-	coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,bin_size)	
+	clip_dist=100
 
-	cdef dict data={}
-	cdef dict splits={}
-	cdef dict clips={}
+	cdef long read_position
+	cdef long read_end
+	cdef int mapq
 
-	for chrA in bam_header["SQ"]:
+	for read in samfile.fetch(chromosome,until_eof=True):
 
-		clips[chrA["SN"]]=[]
-		for chrB in bam_header["SQ"]:
-			if chrA["SN"] <= chrB["SN"]:
-				if not chrA["SN"] in data:
-					data[chrA["SN"]] = {}
-					splits[chrA["SN"]] = {}
-				data[chrA["SN"]][chrB["SN"]]={}
-				splits[chrA["SN"]][chrB["SN"]]={}
+		if read.is_unmapped or read.is_duplicate:
+			continue
 
+		read_chromosome=read.reference_name
+		mate_chromosome=read.next_reference_name
+		read_position=read.reference_start
+		read_end=read.reference_end
+		read_mapq=read.mapq
+		read_supplementary=read.is_supplementary
 
-	chromosome_set=set([])
-	chromosomes=[]
+		if read_mapq >= min_q:
+			coverage_data=tiddit_coverage.update_coverage(read_position,read_end,bin_size,coverage_data,end_bin_size)
 
-	clip_dist=100
+		if read_supplementary or read.is_secondary:
+			continue
 
-	t_tot=time.time()
-	f=open("{}_tiddit/clips_{}.fa".format(prefix,sample_id ),"w")
-	for read in samfile.fetch(until_eof=True):
+		if read_mapq > 1:
+			cigar_tuple=read.cigartuples
+			if (cigar_tuple[0][0] == 4 and cigar_tuple[0][1] > 10) and (cigar_tuple[-1][0] == 0 and cigar_tuple[-1][1] > 30) and len(cigar_tuple) < 7:
+				clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
 
-		if read.is_unmapped or read.is_duplicate:
+			elif cigar_tuple[-1][0] == 4 and cigar_tuple[-1][1] > 10 and (cigar_tuple[0][0] == 0 and cigar_tuple[0][1] > 30) and len(cigar_tuple) < 7:
+				clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
+
+		if read_mapq < min_q:
 			continue
 
-		t=time.time()
-		if read.mapq >= min_q:
-			coverage_data[read.reference_name]=tiddit_coverage.update_coverage(read,bin_size,coverage_data[read.reference_name],min_q,end_bin_size[read.reference_name])
-		t_update+=time.time()-t
+		if read.has_tag("SA"):
+			split=SA_analysis(read,min_q,"SA",read_chromosome)
+			if split:
+				splits.append(split)
 
-		if not read.reference_name in chromosome_set:
-			print("Collecting signals on contig: {}".format(read.reference_name))
-			chromosome_set.add(read.reference_name)
+		if read.mate_is_unmapped:
+			continue
 
-			if len(chromosomes):
-				for clip in clips[ chromosomes[-1] ]:
-					f.write("".join( clip ))
-				del clips[ chromosomes[-1] ]
 
-			chromosomes.append(read.reference_name)
+		if ( abs(read.isize) > max_ins or mate_chromosome != read_chromosome ):
+			read_query_name=read.query_name
 
+			if mate_chromosome < read_chromosome:
+				chrA=mate_chromosome
+				chrB=read_chromosome
+			else:
+				chrA=read_chromosome
+				chrB=mate_chromosome
 
-		t=time.time()
+			data.append([chrA,chrB,read_query_name,read_position+1,read_end+1,read.is_reverse,read_chromosome])
 
-		if read.has_tag("SA") and not (read.is_supplementary or read.is_secondary) and read.mapq >= min_q:
-			splits=SA_analysis(read,min_q,splits,"SA")
+	f=open("{}_tiddit/clips/{}.fa".format(prefix,chromosome),"w")
+	for clip in clips:
+		f.write("".join(clip))
+	f.close()
 
-		if not (read.is_supplementary or read.is_secondary) and read.mapq > 1:
-			if (read.cigartuples[0][0] == 4 and read.cigartuples[0][1] > 10) and (read.cigartuples[-1][0] == 0 and read.cigartuples[-1][1] > 30) and len(read.cigartuples) < 7:
-				clips[read.reference_name].append([">{}|{}|{}\n".format(read.query_name,read.reference_name,read.reference_start+1),read.query_sequence+"\n"])
+	return(chromosome,data,splits,coverage_data, "{}_tiddit/clips/{}.fa".format(prefix,chromosome) )
 
-			elif read.cigartuples[-1][0] == 4 and read.cigartuples[-1][1] > 10 and (read.cigartuples[0][0] == 0 and read.cigartuples[0][1] > 30) and len(read.cigartuples) < 7:
-				clips[read.reference_name].append([">{}|{}|{}\n".format(read.query_name,read.reference_name,read.reference_start+1),read.query_sequence+"\n"])
+def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig):
 
-		t_split+=time.time()-t
+	samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))
+	bam_header=samfile.header
+	samfile.close()
+	cdef int bin_size=50
+	cdef str file_type="wig"
+	cdef str outfile=prefix+".tiddit_coverage.wig"
 
-		t=time.time()
-		if ( abs(read.isize) > max_ins or read.next_reference_name != read.reference_name ) and read.mapq >= min_q and not (read.is_supplementary or read.is_secondary):
-			if read.next_reference_name < read.reference_name:
-				chrA=read.next_reference_name
-				chrB=read.reference_name
-			else:
-				chrA=read.reference_name
-				chrB=read.next_reference_name
+	t_tot=0
 
-			if not read.query_name in data[chrA][chrB]:
-				data[chrA][chrB][read.query_name]=[]
+	cdef dict data={}
+	cdef dict splits={}
+	cdef dict coverage_data={}
+	cdef list clip_fasta=[]
+	chromosomes=[]
+	
+	for chrA in bam_header["SQ"]:
+		if chrA["LN"] < min_contig:
+			continue
 
+		chromosomes.append(chrA["SN"])
+		data[chrA["SN"]]={}
+		splits[chrA["SN"]]={}
+		for chrB in bam_header["SQ"]:
+			data[chrA["SN"]][chrB["SN"]]={}
+			splits[chrA["SN"]][chrB["SN"]]={}
 
-			data[chrA][chrB][read.query_name].append([read.reference_start+1,read.reference_end+1,read.is_reverse,read.reference_name])
-		t_disc+=time.time()-t
+	t=time.time()
+	res=Parallel(n_jobs=threads)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size) for chromosome in chromosomes )
 
-	f.close()
+	chromosomes=set(chromosomes)
+	for i in range(0,len(res)):
+		coverage_data[ res[i][0] ] = res[i][3]
+
+		if not res[i][0] in chromosomes:
+			continue
+
+		for signal in res[i][1]:
+			if not signal[0] in data:
+				continue
+	
+			if not signal[2] in data[ signal[0] ][ signal[1] ]:
+				data[ signal[0] ][signal[1]][signal[2]]=[]
+			data[ signal[0] ][signal[1]][signal[2]].append(signal[3:])
+
+		for signal in res[i][2]:
+			if not signal[0] in splits:
+				continue
+
+			if not signal[2] in splits[ signal[0] ][ signal[1] ]:
+				splits[ signal[0] ][signal[1]][signal[2]]=[]
+			splits[ signal[0] ][signal[1]][signal[2]]+=signal[3:]
+
+		clip_fasta.append(res[i][4])
+
+	t_tot=time.time()-t
 
-	print("total",time.time()-t_tot)
-	print("coverage",t_update)
-	print("split",t_split)
-	print("disc",t_disc)
+	print("total",t_tot)
+	#print("coverage",t_update)
+	#print("split",t_split)
+	#print("disc",t_disc)
 
 	#print("writing coverage wig")
 	#tiddit_coverage.print_coverage(coverage_data,bam_header,bin_size,file_type,outfile)
@@ -217,7 +251,6 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
 
 	for chrA in data:
 		for chrB in data[chrA]:
-
 			for fragment in data[chrA][chrB]:
 				if len(data[chrA][chrB][fragment]) < 2:
 					continue
@@ -234,27 +267,20 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
 						out=data[chrA][chrB][fragment][1][0:-1]+data[chrA][chrB][fragment][0][0:-1]
 
 				f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, out )))  )
-
 	f.close()
 
 	f=open("{}_tiddit/splits_{}.tab".format(prefix,sample_id),"w")
 
 	for chrA in splits:
 		for chrB in splits[chrA]:
-
 			for fragment in splits[chrA][chrB]:
 				f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, splits[chrA][chrB][fragment] )))  )
-
 	f.close()
 
-	f=open("{}_tiddit/clips_{}.fa".format(prefix,sample_id),"a")
-
-	for chrA in clips:
-		for clip in clips[chrA]:
-			f.write("".join( clip ))
+	f=open("{}_tiddit/clips_{}.fa".format(prefix,sample_id),"w")
+	for clips in clip_fasta:
+		for clip in open(clips):
+			f.write(clip)
 	f.close()
 
-
 	return(coverage_data)
-	#return(coverage_data,clips)
-	#return(coverage_data,clips)
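
The restructuring above follows the usual joblib fan-out: pysam file handles cannot be shared between worker processes, so each worker opens the BAM itself (via the CSI index written by __main__.py), scans a single chromosome, and returns plain lists that the parent then merges back into the chrA/chrB tables. A stripped-down sketch of the pattern, with illustrative names and paths:

	from joblib import Parallel, delayed
	import pysam

	def worker(chromosome, bam, ref, csi):
	    # each process opens its own handle; handles are not picklable
	    samfile = pysam.AlignmentFile(bam, "r", reference_filename=ref,
	                                  index_filename=csi)
	    signals = []
	    for read in samfile.fetch(chromosome):
	        if read.is_unmapped or read.is_duplicate:
	            continue
	        signals.append((read.reference_name, read.reference_start))
	    samfile.close()
	    return chromosome, signals

	res = Parallel(n_jobs=4)(delayed(worker)(c, "sample.bam", "ref.fa", "sample.csi")
	                         for c in ["chr1", "chr2"])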


=====================================
tiddit/tiddit_variant.pyx
=====================================
@@ -1,6 +1,7 @@
 import time
 import math
 import numpy
+from joblib import Parallel, delayed
 
 #from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
 from pysam import AlignmentFile, AlignedSegment
@@ -13,6 +14,7 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
 	cdef int n_discs=0
 	cdef int n_splits=0
 
+
 	cdef int crossing_r=0
 	cdef int crossing_f=0
 
@@ -25,21 +27,32 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
 	if q_end > contig_length:
 		q_end=contig_length
 
+	if q_start >= q_end:
+		q_start=q_end-10
+
+	cdef long read_reference_start
+	cdef long read_reference_end
+
+	cdef long r_start
+	cdef long r_end
+
 	for read in samfile.fetch(chr, q_start, q_end):
 		if read.is_unmapped:
 			continue
+
+		read_reference_start=read.reference_start
+
 		if not read.mate_is_unmapped:
-			if read.next_reference_start > end and read.reference_start > end:
+			if read.next_reference_start > end and read_reference_start > end:
 				continue		
 		else:
-			if read.reference_start > end:
+			if read_reference_start > end:
 				continue
 
-
 		if read.is_duplicate:
 			continue
 		
-		if not (read.reference_start > end):
+		if not (read_reference_start > end):
 			n_reads+=1
 			if read.mapq < min_q:
 				low_q+=1
@@ -47,27 +60,31 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
 		if read.mapq < min_q:
 			continue
 
-		r_start=read.reference_start
-		r_end=read.reference_end
+		read_reference_end=read.reference_end
+		read_reference_name=read.reference_name
+		read_next_reference_name=read.next_reference_name
 
-		if r_start < bp-10 and r_end > bp:
+		r_start=read_reference_start
+		r_end=read_reference_end
+
+		if read_reference_start < bp-10 and r_end > bp:
 			crossing_r+=1
 
 		mate_bp_read= (read.next_reference_start < bp and r_end > bp)
-		discordant= ( abs(read.isize) > max_ins or read.next_reference_name != read.reference_name )
+		discordant= ( abs(read.isize) > max_ins or read_next_reference_name != read_reference_name )
 
 		if mate_bp_read and not discordant:
 			crossing_f+=1
 
-		if read.reference_end < start:
+		if read_reference_end < start:
 			continue
-		elif read.reference_start > end:
+		elif read_reference_start > end:
 			continue
 
-		if read.reference_start < start:
+		if read_reference_start < start:
 			r_start=start
 		
-		if read.reference_end > end:
+		if read_reference_end > end:
 			r_end=end
 
 		bases+=r_end-r_start+1
@@ -75,7 +92,7 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
 		if read.has_tag("SA"):
 			n_splits+=1
 
-		if (abs(read.isize) > max_ins) or (read.next_reference_name != read.reference_name) :
+		if discordant:
 			n_discs+=1
 
 	coverage= bases/(end-start+1)
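
The hunk above computes the discordant-pair test once, caches it in a local, and reuses it for the n_discs counter. The predicate itself, as a standalone sketch: a pair counts as discordant when the insert size exceeds max_ins or the mates map to different contigs.

	def is_discordant(read, max_ins):
	    # extreme insert size, or mates on different contigs
	    return abs(read.isize) > max_ins or \
	           read.next_reference_name != read.reference_name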
@@ -168,318 +185,321 @@ def sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_s
 
 	return(filt)
 
-def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len):
-
-	variants={}
-	#cdef AlignmentFile samfile  = AlignmentFile(bam_file_name, "r")
-	samfile  = AlignmentFile(bam_file_name, "r",reference_filename=args.ref)
-
-	contig_seqs={}
-	new_seq=False
-	if not args.skip_assembly:
-		for line in open("{}_tiddit/clips.fa.assembly.clean.mag".format(args.o)):			
-
-			if not new_seq and line[0] == "@" and "\t" in line:
-				name=line.split("\t")[0][1:]
-				new_seq=True
+def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs):
 
-			elif new_seq:
-				contig_seqs[name]=line.strip("\n")
-				new_seq=False
-
-	for chrA in sv_clusters:
-		variants[chrA]=[]
-		for chrB in sv_clusters[chrA]:
-			variants[chrB]=[]
+	samfile  = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
+	variants=[]
 
 	var_n=0
-	for chrA in sv_clusters:
-		for chrB in sv_clusters[chrA]:
+	for chrB in sv_clusters[chrA]:
+		for cluster in sv_clusters[chrA][chrB]:
+			n_discordants=sv_clusters[chrA][chrB][cluster]["N_discordants"]
+			n_splits=sv_clusters[chrA][chrB][cluster]["N_splits"]
+			n_contigs=sv_clusters[chrA][chrB][cluster]["N_contigs"]
 
-			for cluster in sv_clusters[chrA][chrB]:
-				n_discordants=sv_clusters[chrA][chrB][cluster]["N_discordants"]
-				n_splits=sv_clusters[chrA][chrB][cluster]["N_splits"]
-				n_contigs=sv_clusters[chrA][chrB][cluster]["N_contigs"]
-
-				if (n_discordants < args.p and n_splits < args.r) and not n_contigs:
+			if (n_discordants < args.p and n_splits < args.r) and not n_contigs:
 					continue
 
-				posA=sv_clusters[chrA][chrB][cluster]["posA"]
-				posB=sv_clusters[chrA][chrB][cluster]["posB"]
-
-				if chrA == chrB and posA > posB:
-					posT=posA
-					posA=posB
-					posB=posT
-
-				if chrA == chrB and abs(posA-posB) < args.z:
-					continue
+			posA=sv_clusters[chrA][chrB][cluster]["posA"]
+			posB=sv_clusters[chrA][chrB][cluster]["posB"]
 
-				s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startA"]/50.0))
-				e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endA"]/50.0))+1
-				avg_a=numpy.average(coverage_data[chrA][s:e])
+			if chrA == chrB and posA > posB:
+				posT=posA
+				posA=posB
+				posB=posT
 
-				if avg_a > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
-					continue
-				elif (args.max_coverage*n_discordants/avg_a < args.p_ratio/2 and args.max_coverage*n_splits/avg_a < args.r_ratio/2) and not n_contigs:
-					continue
+			if chrA == chrB and abs(posA-posB) < args.z:
+				continue
 
-				avg_b=numpy.average(coverage_data[chrA][s:e])
-				if avg_b == 0:
-					continue
-				elif avg_b > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
-					continue
-				elif (args.max_coverage*n_discordants/avg_b < args.p_ratio/2 and args.max_coverage*n_splits/avg_b < args.r_ratio/2) and not n_contigs:
-					continue
+			s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startA"]/50.0))
+			e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endA"]/50.0))+1
+			avg_a=numpy.average(coverage_data[chrA][s:e])
 
+			if avg_a > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
+				continue
+			elif (args.max_coverage*n_discordants/avg_a < args.p_ratio/2 and args.max_coverage*n_splits/avg_a < args.r_ratio/2) and not n_contigs:
+				continue
 
-				var_n+=1
+			avg_b=numpy.average(coverage_data[chrA][s:e])
+			if avg_b == 0:
+				continue
+			elif avg_b > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
+				continue
+			elif (args.max_coverage*n_discordants/avg_b < args.p_ratio/2 and args.max_coverage*n_splits/avg_b < args.r_ratio/2) and not n_contigs:
+				continue
 
-				sample_data={}
-				for sample in samples:
 
-					coverageA,frac_low_qA,n_discsA,n_splitsA,crossing_f_A,crossing_r_A=get_region(samfile,chrA,sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"],posA,min_mapq,max_ins_len,contig_number)
-					coverageB,frac_low_qB,n_discsB,n_splitsB,crossing_f_B,crossing_r_B=get_region(samfile,chrB,sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"],posB,min_mapq,max_ins_len,contig_number)
+			var_n+=1
+			sample_data={}
+			for sample in samples:
 
-					sample_data[sample]={}
-					sample_data[sample]={"covA":coverageA,"QA":frac_low_qA,"discA":n_discsA,"splitA":n_splitsA,"refRA":crossing_r_A,"refFA":crossing_f_A}
-					sample_data[sample].update({"covB":coverageB,"QB":frac_low_qA,"discB":n_discsB,"splitB":n_splitsB,"refRB":crossing_r_B,"refFB":crossing_f_B})
+				coverageA,frac_low_qA,n_discsA,n_splitsA,crossing_f_A,crossing_r_A=get_region(samfile,chrA,sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"],posA,min_mapq,max_ins_len,contig_number)
+				coverageB,frac_low_qB,n_discsB,n_splitsB,crossing_f_B,crossing_r_B=get_region(samfile,chrB,sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"],posB,min_mapq,max_ins_len,contig_number)
 
-					if chrA != chrB:
-						sample_data[sample]["covM"]=0
-					elif abs(posB - posA) < 1000:
-						if posA < posB:
-							coverageM,_,_,_,_,_=get_region(samfile,chrA,posA,posB,posA,min_mapq,max_ins_len,contig_number)
-						else:
-							coverageM,_,_,_,_,_=get_region(samfile,chrA,posB,posA,posB,min_mapq,max_ins_len,contig_number)
+				sample_data[sample]={}
+				sample_data[sample]={"covA":coverageA,"QA":frac_low_qA,"discA":n_discsA,"splitA":n_splitsA,"refRA":crossing_r_A,"refFA":crossing_f_A}
+				sample_data[sample].update({"covB":coverageB,"QB":frac_low_qA,"discB":n_discsB,"splitB":n_splitsB,"refRB":crossing_r_B,"refFB":crossing_f_B})
 
-						sample_data[sample]["covM"]=coverageM
+				if chrA != chrB:
+					sample_data[sample]["covM"]=0
+				elif abs(posB - posA) < 1000:
+					if posA < posB:
+						coverageM,_,_,_,_,_=get_region(samfile,chrA,posA,posB,posA,min_mapq,max_ins_len,contig_number)
 					else:
-						s=int(math.floor(posA/50.0))
-						e=int(math.floor(posB/50.0))+1
-						sample_data[sample]["covM"]=numpy.average(coverage_data[chrA][s:e] )
-				
-				inverted=0
-				non_inverted=0
-				for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"]) ):
-					if sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_discordants"][i]:
-						inverted+=1
-					else:
-						non_inverted+=1
+						coverageM,_,_,_,_,_=get_region(samfile,chrA,posB,posA,posB,min_mapq,max_ins_len,contig_number)
 
-				for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"]) ):
-					if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_splits"][i]:
-						inverted+=1
-					else:
-						non_inverted+=1
+					sample_data[sample]["covM"]=coverageM
+				else:
+					s=int(math.floor(posA/50.0))
+					e=int(math.floor(posB/50.0))+1
+					sample_data[sample]["covM"]=numpy.average(coverage_data[chrA][s:e] )
+				
+			inverted=0
+			non_inverted=0
+			for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"]) ):
+				if sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_discordants"][i]:
+					inverted+=1
+				else:
+					non_inverted+=1
 
-				for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"]) ):
-					if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_contigs"][i]:
-						inverted+=1
-					else:
-						non_inverted+=1
+			for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"]) ):
+				if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_splits"][i]:
+					inverted+=1
+				else:
+					non_inverted+=1
 
-				svtype,cn=find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,library)
+			for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"]) ):
+				if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_contigs"][i]:
+					inverted+=1
+				else:
+					non_inverted+=1
 
-				filt=sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_splits,library,sample_data[sample]["discA"],sample_data[sample]["discB"],sample_data[sample]["splitA"],sample_data[sample]["splitB"],n_contigs)
-				format_col="GT:CN:COV:DR:SR:LQ:RR:RD"
+			svtype,cn=find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,library)
 
-				#configure filters for CNV based on Read depth
-				for sample in samples:
-					if "DEL" in svtype:
-						#homozygout del based on coverage 
-						if cn == 0:
-							filt="PASS"
+			filt=sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_splits,library,sample_data[sample]["discA"],sample_data[sample]["discB"],sample_data[sample]["splitA"],sample_data[sample]["splitB"],n_contigs)
+			format_col="GT:CN:COV:DV:RV:LQ:RR:RD"
 
-						covA=sample_data[sample]["covA"]
-						covM=sample_data[sample]["covM"]
-						covB=sample_data[sample]["covB"]
+			#configure filters for CNV based on Read depth
+			for sample in samples:
+				if "DEL" in svtype:
+					#homozygous del based on coverage
+					if cn == 0:
+						filt="PASS"
 
-						#normal coverage on the flanking regions, abnormal inbetween
-						if covA > covM*(cn+0.9) and covB > covM*(cn+0.9):
-							filt="PASS"
+					covA=sample_data[sample]["covA"]
+					covM=sample_data[sample]["covM"]
+					covB=sample_data[sample]["covB"]
 
-					#too few reads, but clear RD signal
-					elif "DUP" in svtype and filt == "BelowExpectedLinks":
+					#normal coverage on the flanking regions, abnormal inbetween
+					if covA > covM*(cn+0.9) and covB > covM*(cn+0.9):
 						filt="PASS"
 
+				#too few reads, but clear RD signal
+				elif "DUP" in svtype and filt == "BelowExpectedLinks":
+					filt="PASS"
 
 
-				if svtype != "BND":
-					info=["SVTYPE={}".format(svtype),"SVLEN={}".format(posB-posA),"END={}".format(posB)]
-					alt="<{}>".format(svtype)
 
-					info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
-					info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
-					info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
-					info+=["LFB={},{}".format(sample_data[sample]["discB"],sample_data[sample]["splitB"])]
-					info+=["LTE={},{}".format(n_discordants,n_splits)]
+			if svtype != "BND":
+				info=["SVTYPE={}".format(svtype),"SVLEN={}".format(posB-posA),"END={}".format(posB)]
+				alt="<{}>".format(svtype)
 
-					if n_contigs:
-						for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
-							if "_" in c:
-								c=c.split("_")[0]
-							ctgs=[ contig_seqs[c] ]
-						info+=["CTG={}".format("|".join(ctgs) )]
+				info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
+				info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
+				info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
+				info+=["LFB={},{}".format(sample_data[sample]["discB"],sample_data[sample]["splitB"])]
+				info+=["LTE={},{}".format(n_discordants,n_splits)]
 
-					else:
-						info+=["CTG=."]
+				if n_contigs:
+					for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
+						if "_" in c:
+							c=c.split("_")[0]
+						ctgs=[ contig_seqs[c] ]
+					info+=["CTG={}".format("|".join(ctgs) )]
 
+				else:
+					info+=["CTG=."]
 
 
-					info=";".join(info)
-					variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt,".",filt,info,format_col]
-					for sample in samples:
-						GT="./."
 
-						if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
-							GT="0/1"
-						if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
-							GT="1/1"
-						if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
-							GT="1/1"
-						if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
-							if sample_data[sample]["covB"]:
-								if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
-									GT="1/1"
-								else:
-									GT="0/1"
+				info=";".join(info)
+				variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt,".",filt,info,format_col]
+				for sample in samples:
+					GT="./."
+
+					if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
+						GT="0/1"
+					if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
+						GT="1/1"
+					if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
+						GT="1/1"
+					if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
+						if sample_data[sample]["covB"]:
+							if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
+								GT="1/1"
 							else:
-									GT="1/1"
+								GT="0/1"
+						else:
+							GT="1/1"
 
-							if sample_data[sample]["covA"]:
-								if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
-									GT="1/1"
-								else:
-									GT="0/1"
-							else:
+						if sample_data[sample]["covA"]:
+							if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
 								GT="1/1"
-
-
-						if "DEL" in alt:
-							if cn == 0:
-								GT = "1/1"
-							else:
-								GT= "0/1"
-						elif "DUP" in alt:
-							if cn >= 2*library["contig_ploidy_{}".format(chrA)]:
-								GT= "1/1"
 							else:
 								GT="0/1"
+						else:
+							GT="1/1"
+
+
+					if "DEL" in alt:
+						if cn == 0:
+							GT = "1/1"
+						else:
+							GT= "0/1"
+					elif "DUP" in alt:
+						if cn >= 2*library["contig_ploidy_{}".format(chrA)]:
+							GT= "1/1"
+						else:
+							GT="0/1"
 							
-						variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
-					variants[chrA].append([posA,variant])
+					variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
+				variants.append([chrA,posA,variant])
+			else:
+				info=["SVTYPE=BND".format(svtype)]
+				inverted=False
+				before=True
+
+				if posA == sv_clusters[chrA][chrB][cluster]["endA"]:
+					before=False
+
+				if inverted  > non_inverted:
+					inverted=True
+
+				if not inverted and not before:
+					alt_str_a="N[{}:{}[".format(chrB,posB)
+					alt_str_b="]{}:{}]N".format(chrA,posA)
+				elif not inverted and before:
+					alt_str_a="]{}:{}]N".format(chrB,posB)
+					alt_str_b="N[{}:{}[".format(chrA,posA)
+				elif inverted and  not before:
+					alt_str_a="N]{}:{}]".format(chrB,posB)
+					alt_str_b="[{}:{}[N".format(chrA,posA)
 				else:
-					info=["SVTYPE=BND".format(svtype)]
-					inverted=False
-					before=True
-
-					if posA == sv_clusters[chrA][chrB][cluster]["endA"]:
-						before=False
-
-					if inverted  > non_inverted:
-						inverted=True
-
-					if not inverted and not before:
-						alt_str_a="N[{}:{}[".format(chrB,posB)
-						alt_str_b="]{}:{}]N".format(chrA,posA)
-					elif not inverted and before:
-						alt_str_a="]{}:{}]N".format(chrB,posB)
-						alt_str_b="N[{}:{}[".format(chrA,posA)
-					elif inverted and  not before:
-						alt_str_a="N]{}:{}]".format(chrB,posB)
-						alt_str_b="[{}:{}[N".format(chrA,posA)
-					else:
-						alt_str_a="[{}:{}[N".format(chrB,posB)
-						alt_str_b="N]{}:{}]".format(chrA,posA)
+					alt_str_a="[{}:{}[N".format(chrB,posB)
+					alt_str_b="N]{}:{}]".format(chrA,posA)
 
-					info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
-					info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
-					info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
-					info+=["LFB={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
-					info+=["LTE={},{}".format(n_discordants,n_splits)]
+				info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
+				info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
+				info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
+				info+=["LFB={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
+				info+=["LTE={},{}".format(n_discordants,n_splits)]
 
-					if n_contigs:
-						for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
-							if "_" in c:
-								c=c.split("_")[0]
+				if n_contigs:
+					for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
+						if "_" in c:
+							c=c.split("_")[0]
 
-							ctgs=[ contig_seqs[c] ]
-						info+=["CTG={}".format("|".join(ctgs) )]
+						ctgs=[ contig_seqs[c] ]
+					info+=["CTG={}".format("|".join(ctgs) )]
 
-					else:
-						info+=["CTG=."]
+				else:
+					info+=["CTG=."]
 
 
 
-					info=";".join(info)
-					variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt_str_a,".",filt,info,format_col]
-					for sample in samples:
-						GT="./."
-						if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
-							GT="0/1"
-						if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
-							GT="1/1"
-						if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
-							GT="1/1"
-						if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
-							if sample_data[sample]["covB"]:
-								if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
-									GT="1/1"
-								else:
-									GT="0/1"
+				info=";".join(info)
+				variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt_str_a,".",filt,info,format_col]
+				for sample in samples:
+					GT="./."
+					if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
+						GT="0/1"
+					if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
+						GT="1/1"
+					if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
+						GT="1/1"
+					if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
+						if sample_data[sample]["covB"]:
+							if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
+								GT="1/1"
 							else:
-									GT="1/1"
-
-							if sample_data[sample]["covA"]:
-								if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
-									GT="1/1"
-								else:
-									GT="0/1"
-
+								GT="0/1"
+						else:
+								GT="1/1"
 
+						if sample_data[sample]["covA"]:
+							if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
+								GT="1/1"
 							else:
-									GT="1/1"
+								GT="0/1"
 
 
+						else:
+								GT="1/1"
 
-						variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
-					variants[chrA].append([posA,variant])
 
 
-					variant=[chrB,str(posB),"SV_{}_2".format(var_n),"N",alt_str_b,".",filt,info,format_col]
-					for sample in samples:
-						GT="./."
-						if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
-							GT="0/1"
-						if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
-							GT="1/1"
-						if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
-							GT="1/1"
-						if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
-							if sample_data[sample]["covB"]:
-								if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
-									GT="1/1"
-								else:
-									GT="0/1"
-							else:
-									GT="1/1"
+					variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
+				variants.append([chrA,posA,variant])
 
-							if sample_data[sample]["covA"]:
-								if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
-									GT="1/1"
-								else:
-									GT="0/1"
 
+				variant=[chrB,str(posB),"SV_{}_2".format(var_n),"N",alt_str_b,".",filt,info,format_col]
+				for sample in samples:
+					GT="./."
+					if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
+						GT="0/1"
+					if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
+						GT="1/1"
+					if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
+						GT="1/1"
+					if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
+						if sample_data[sample]["covB"]:
+							if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
+								GT="1/1"
+							else:
+								GT="0/1"
+						else:
+								GT="1/1"
 
+						if sample_data[sample]["covA"]:
+							if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
+								GT="1/1"
 							else:
-									GT="1/1"
+								GT="0/1"
+						else:
+								GT="1/1"
 
 
 
 
-						variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
-					variants[chrB].append([posB,variant])				
+					variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
+				variants.append([chrB,posB,variant])				
 
 	samfile.close()
 	return(variants)
+
+def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len):
+	contig_seqs={}
+	new_seq=False
+	if not args.skip_assembly:
+		for line in open("{}_tiddit/clips.fa.assembly.clean.mag".format(args.o)):			
+
+			if not new_seq and line[0] == "@" and "\t" in line:
+				name=line.split("\t")[0][1:]
+				new_seq=True
+
+			elif new_seq:
+				contig_seqs[name]=line.strip("\n")
+				new_seq=False
+	variants={}
+	for chrA in sv_clusters:
+		variants[chrA]=[]
+		for chrB in sv_clusters[chrA]:
+			variants[chrB]=[]
+
+	variants_list=Parallel(n_jobs=args.threads)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
+
+	for v in variants_list:
+		for variant in v:
+			variants[ variant[0] ].append( [ variant[1],variant[2] ] )
+		
+	return(variants)
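
In the BND branch above, the four alt_str combinations follow the VCF breakend bracket notation. A worked example for the non-inverted, before=False case (the branch taken when posA sits at the end of region A):

	# ALT strings of the two records of one breakend pair,
	# written at chrA:posA and chrB:posB respectively
	chrA, posA, chrB, posB = "chr1", 1000, "chr5", 2000
	alt_str_a = "N[{}:{}[".format(chrB, posB)  # -> N[chr5:2000[
	alt_str_b = "]{}:{}]N".format(chrA, posA)  # -> ]chr1:1000]N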



View it on GitLab: https://salsa.debian.org/med-team/tiddit/-/commit/91690294192730217026f2c1710e063d6b9e0b07
