[med-svn] [Git][med-team/tiddit][upstream] New upstream version 3.3.0+dfsg
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Fri Aug 19 13:45:53 BST 2022
Nilesh Patra pushed to branch upstream at Debian Med / tiddit
Commits:
91690294 by Nilesh Patra at 2022-08-19T18:01:55+05:30
New upstream version 3.3.0+dfsg
- - - - -
8 changed files:
- README.md
- setup.py
- tiddit/__main__.py
- tiddit/tiddit_contig_analysis.pyx
- tiddit/tiddit_coverage.pyx
- tiddit/tiddit_coverage_analysis.pyx
- tiddit/tiddit_signal.pyx
- tiddit/tiddit_variant.pyx
Changes:
=====================================
README.md
=====================================
@@ -8,7 +8,8 @@ On a 30X human genome, the TIDDIT SV module typically completes within 5 hours
INSTALLATION
==============
-TIDDIT requires python3, cython, pysam, and Numpy.
+TIDDIT requires python3 (>= 3.8), cython, pysam, and Numpy.
+
By default, tiddit requires bwa, fermi2 and ropebwt2 for local assembly; local assembly may be disabled through the "--skip_assembly" parameter.
@@ -74,6 +75,7 @@ TIDDIT may be fine-tuned by altering these optional parameters:
-d expected reads orientations, possible values "innie" (-> <-) or "outtie" (<- ->). Default: major orientation within the dataset
-p Minimum number of supporting pairs in order to call a variant (default 3)
-r Minimum number of supporting split reads to call a variant (default 3)
+ --threads Number of threads (default 1)
-q Minimum mapping quality to consider an alignment (default 5)
-n the ploidy of the organism (default = 2)
-e clustering distance parameter, discordant pairs closer than this distance are considered to belong to the same variant(default = sqrt(insert-size*2)*12)
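For illustration (not part of the upstream README): the new --threads option combines with the existing flags. A minimal sketch, assuming the package is installed so that "python -m tiddit" works; the file names are placeholders.

import subprocess

# --bam/--ref/-o/--threads follow the argparse definitions added in
# this version; the input paths are hypothetical.
subprocess.run(
    ["python", "-m", "tiddit", "--sv",
     "--bam", "sample.bam",
     "--ref", "reference.fasta",
     "-o", "sample_out",
     "--threads", "4"],
    check=True,
)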
=====================================
setup.py
=====================================
@@ -20,8 +20,9 @@ else:
setup(
name = 'tiddit',
- version = '3.1.0',
- url = "https://github.com/J35P312/SVDB",
+ version = '3.3.0',
+
+ url = "https://github.com/SciLifeLab/TIDDIT",
author = "Jesper Eisfeldt",
author_email= "jesper.eisfeldt at scilifelab.se",
ext_modules = ext_modules,
=====================================
tiddit/__main__.py
=====================================
@@ -5,6 +5,7 @@ import time
import pysam
import os
import shutil
+import glob
import tiddit.tiddit_stats as tiddit_stats
import tiddit.tiddit_signal as tiddit_signal
@@ -16,7 +17,7 @@ import tiddit.tiddit_variant as tiddit_variant
import tiddit.tiddit_contig_analysis as tiddit_contig_analysis
def main():
- version="3.1.0"
+ version="3.3.0"
parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true")
parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")
@@ -31,6 +32,7 @@ def main():
parser.add_argument('-i', type=int, help="paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)")
parser.add_argument('-d', type=str,help="expected reads orientations, possible values \"innie\" (-> <-) or \"outtie\" (<- ->). Default: major orientation within the dataset")
parser.add_argument('-p', type=int,default=3, help="Minimum number of supporting pairs in order to call a variant (default 3)")
+ parser.add_argument('--threads', type=int,default=1, help="Number of threads (default=1)")
parser.add_argument('-r', type=int,default=3, help="Minimum number of supporting split reads to call a variant (default 3)")
parser.add_argument('-q', type=int,default=5, help="Minimum mapping quality to consider an alignment (default 5)")
parser.add_argument('-n', type=int,default=2, help="the ploidy of the organism,(default = 2)")
@@ -69,8 +71,8 @@ def main():
print("error, ropebwt2 executable missing, add ropebwt2 to path, or specify using --ropebwt2")
quit()
- if not os.path.isfile(args.ref+".bwt") and not os.path.isfile(args.ref+".64.bwt"):
- print ("error, The reference must be indexed using bwa index")
+ if not glob.glob("{}*.bwt*".format(args.ref)):
+ print ("error, The reference must be indexed using bwa index; run bwa index, or skip local assembly (--skip_assembly)")
quit()
@@ -89,6 +91,7 @@ def main():
bam_file_name=args.bam
samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=args.ref)
+
bam_header=samfile.header
samfile.close()
@@ -109,7 +112,7 @@ def main():
contigs.append(contig["SN"])
contig_number[contig["SN"]]=i
contig_length[ contig["SN"] ]=contig["LN"]
- i+=0
+ i+=1
prefix=args.o
try:
@@ -117,6 +120,8 @@ def main():
os.mkdir("{}_tiddit/clips".format(prefix) )
except:
print("Folder already exists")
+
+ pysam.index("-c","-m","6","-@",str(args.threads),bam_file_name,"{}_tiddit/{}.csi".format(args.o,sample_id))
min_mapq=args.q
max_ins_len=100000
@@ -130,7 +135,7 @@ def main():
t=time.time()
- coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id)
+ coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig)
print("extracted signals in:")
print(t-time.time())
@@ -162,7 +167,6 @@ def main():
f.write(vcf_header+"\n")
t=time.time()
- #print(sv_clusters)
variants=tiddit_variant.main(bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len)
print("analyzed clusters in")
print(time.time()-t)
@@ -202,7 +206,12 @@ def main():
t=time.time()
if read.mapq >= args.q:
n_reads+=1
- coverage_data[read.reference_name]=tiddit_coverage.update_coverage(read,args.z,coverage_data[read.reference_name],args.q,end_bin_size[read.reference_name])
+
+ read_position=read.reference_start
+ read_end=read.reference_end
+ read_reference_name=read.reference_name
+
+ coverage_data[read_reference_name]=tiddit_coverage.update_coverage(read_position,read_end,args.z,coverage_data[read_reference_name],end_bin_size[read_reference_name])
if args.w:
tiddit_coverage.print_coverage(coverage_data,bam_header,args.z,"wig",args.o +".wig")
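The indexing step added above deserves a note: a CSI index (rather than BAI) handles contigs longer than 2^29 bp, and writing it inside the per-run output directory lets every worker open the BAM with an explicit index path. A minimal sketch of the same calls, with hypothetical file names ("-c" requests a CSI index, "-m 6" sets the minimum interval to 2^6 bp, "-@" sets the number of indexing threads):

import pysam

bam = "sample.bam"                    # hypothetical, coordinate-sorted
csi = "sample_out_tiddit/sample.csi"  # hypothetical per-run index path

pysam.index("-c", "-m", "6", "-@", "4", bam, csi)

# Workers can then fetch a single contig through the explicit index:
samfile = pysam.AlignmentFile(bam, "r", index_filename=csi)
for read in samfile.fetch("chr1"):
    pass  # per-contig signal collection happens here
samfile.close()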
=====================================
tiddit/tiddit_contig_analysis.pyx
=====================================
@@ -28,7 +28,14 @@ def read_contigs(aligned_contigs,prefix,sample_id,min_size):
continue
if read.has_tag("SA") and not (read.is_supplementary or read.is_secondary):
- split_contigs=tiddit_signal.SA_analysis(read,-2,split_contigs,"SA")
+ split=tiddit_signal.SA_analysis(read,-2,"SA",read.reference_name)
+
+ if split:
+ if not split[2] in split_contigs[split[0]][split[1]]:
+ split_contigs[split[0]][split[1]][split[2]]=[]
+ split_contigs[split[0]][split[1]][split[2]]+=split[3:]
+
+
elif read.has_tag("XA") and not (read.is_supplementary or read.is_secondary):
XA=read.get_tag("XA")
if XA.count(";") == 1:
@@ -44,7 +51,12 @@ def read_contigs(aligned_contigs,prefix,sample_id,min_size):
XA=",".join(xa_list)
read.set_tag("XA",XA)
- split_contigs=tiddit_signal.SA_analysis(read,-2,split_contigs,"XA")
+ split=tiddit_signal.SA_analysis(read,-2,"XA",read.reference_name)
+
+ if split:
+ if not split[2] in split_contigs[split[0]][split[1]]:
+ split_contigs[split[0]][split[1]][split[2]]=[]
+ split_contigs[split[0]][split[1]][split[2]]+=split[3:]
elif not (read.is_supplementary or read.is_secondary) and len(read.cigartuples) > 2:
@@ -114,9 +126,9 @@ def main(prefix,sample_id,library,contigs,coverage_data,args):
f.close()
del clips
- os.system("{} -dNCr {}_tiddit/clips.fa | {} assemble -l 81 - > {}_tiddit/clips.fa.assembly.mag".format(args.ropebwt2,prefix,args.fermi2,prefix))
+ os.system("{} -dNCr {}_tiddit/clips.fa | {} assemble -t {} -l 81 - > {}_tiddit/clips.fa.assembly.mag".format(args.ropebwt2,prefix,args.fermi2,args.threads,prefix))
os.system("{} simplify -COS -d 0.8 {}_tiddit/clips.fa.assembly.mag 1> {}_tiddit/clips.fa.assembly.clean.mag 2> /dev/null".format(args.fermi2,prefix,prefix))
- os.system("{} mem -x intractg {} {}_tiddit/clips.fa.assembly.clean.mag 1> {}_tiddit/clips.sam 2> /dev/null".format(args.bwa,args.ref,prefix,prefix))
+ os.system("{} mem -t {} -x intractg {} {}_tiddit/clips.fa.assembly.clean.mag 1> {}_tiddit/clips.sam 2> /dev/null".format(args.bwa,args.threads,args.ref,prefix,prefix))
read_contigs("{}_tiddit/clips.sam".format(prefix) , prefix, sample_id, args.z)
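The change of calling convention is easiest to see in isolation: SA_analysis no longer mutates a shared splits dict but returns a flat record, which each caller folds into the nested structure. A sketch of that merge, with illustrative names:

def merge_split(split, split_contigs):
    # Fold a [chrA, chrB, read_name, *fields] record (as returned by
    # SA_analysis) into split_contigs[chrA][chrB][read_name].
    if not split:  # SA_analysis returns () when quality filters fail
        return split_contigs
    chrA, chrB, name = split[0], split[1], split[2]
    if name not in split_contigs[chrA][chrB]:
        split_contigs[chrA][chrB][name] = []
    split_contigs[chrA][chrB][name] += split[3:]
    return split_contigs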
=====================================
tiddit/tiddit_coverage.pyx
=====================================
@@ -1,4 +1,5 @@
import sys
+import time
cimport numpy
import numpy
import math
@@ -6,17 +7,18 @@ cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
-def create_coverage(bam_header,bin_size):
+def create_coverage(bam_header,bin_size,c="all"):
coverage_data={}
end_bin_size={}
for contig in bam_header["SQ"]:
- bins= int(math.ceil(contig["LN"]/float(bin_size)))
- coverage_data[ contig["SN"] ]=numpy.zeros(bins)
- end_bin_size[contig["SN"]]=contig["LN"]-(bins-1)*bin_size
-
+ if c == "all" or contig["SN"] == c:
+ bins= int(math.ceil(contig["LN"]/float(bin_size)))
+ coverage_data[ contig["SN"] ]=numpy.zeros(bins)
+ end_bin_size[contig["SN"]]=contig["LN"]-(bins-1)*bin_size
+ if c != "all":
+ return(coverage_data[ contig["SN"] ],end_bin_size[contig["SN"]])
return(coverage_data,end_bin_size)
-
def print_coverage(coverage_data,bam_header,bin_size,file_type,outfile):
f=open(outfile,"w",buffering=819200)
@@ -43,37 +45,34 @@ def print_coverage(coverage_data,bam_header,bin_size,file_type,outfile):
f.close()
ctypedef numpy.double_t DTYPE_t
-def update_coverage(read,int bin_size,numpy.ndarray[DTYPE_t, ndim=1] coverage_data,int min_q,int end_bin_size):
-
- cdef long ref_start=read.reference_start
- cdef long ref_end=read.reference_end
+def update_coverage(long ref_start,long ref_end,int bin_size,numpy.ndarray[DTYPE_t, ndim=1] coverage_data,int end_bin_size):
cdef int first_bin=ref_start//bin_size
- cdef int end_bin=int(ref_end-1)//bin_size
+ cdef int end_bin=(ref_end-1)//bin_size
- cdef int bases_first_bin
+ cdef float bases_first_bin
if end_bin == first_bin:
bases_first_bin=ref_end-ref_start
- coverage_data[first_bin]=float(bases_first_bin)/bin_size+coverage_data[first_bin]
+ coverage_data[first_bin]=bases_first_bin/bin_size+coverage_data[first_bin]
return(coverage_data)
bases_first_bin=((first_bin+1)*bin_size)-ref_start
- coverage_data[first_bin]=float(bases_first_bin)/bin_size+coverage_data[first_bin]
- cdef int bases_last_bin=(ref_end-1)-end_bin*bin_size
+ coverage_data[first_bin]=bases_first_bin/bin_size+coverage_data[first_bin]
+ cdef float bases_last_bin=(ref_end-1)-end_bin*bin_size
+
if end_bin < len(coverage_data)-1:
- coverage_data[end_bin]+=float(bases_last_bin)/bin_size
+ coverage_data[end_bin]=bases_last_bin/bin_size+coverage_data[end_bin]
else:
- coverage_data[end_bin]+=float(bases_last_bin)/end_bin_size
+ coverage_data[end_bin]=bases_last_bin/end_bin_size+coverage_data[end_bin]
for i in range(first_bin+1,end_bin):
- coverage_data[i]+=1.0
+ coverage_data[i]=1.0+coverage_data[i]
return(coverage_data)
-
#bam_file_name=sys.argv[1]
#samfile = pysam.AlignmentFile(bam_file_name, "r")
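For readers skimming the Cython, a pure-Python rendering of the refactored update_coverage: it now takes plain start/end coordinates instead of a pysam read object, spreads an alignment over fixed-size bins, and credits partial bins fractionally (the last bin of a contig may be shorter than bin_size, hence end_bin_size). Untyped, illustrative version:

import numpy

def update_coverage(ref_start, ref_end, bin_size, coverage, end_bin_size):
    # coverage is a numpy array of per-bin depth, as built by create_coverage
    first_bin = ref_start // bin_size
    end_bin = (ref_end - 1) // bin_size
    if end_bin == first_bin:
        coverage[first_bin] += (ref_end - ref_start) / bin_size
        return coverage
    coverage[first_bin] += ((first_bin + 1) * bin_size - ref_start) / bin_size
    bases_last = (ref_end - 1) - end_bin * bin_size
    if end_bin < len(coverage) - 1:
        coverage[end_bin] += bases_last / bin_size
    else:
        coverage[end_bin] += bases_last / end_bin_size  # short final bin
    coverage[first_bin + 1:end_bin] += 1.0  # fully covered interior bins
    return coverage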
=====================================
tiddit/tiddit_coverage_analysis.pyx
=====================================
@@ -75,6 +75,9 @@ def determine_ploidy(dict coverage_data,contigs,dict library,int ploidy,str pref
library["avg_coverage"]=c
for chromosome in contigs:
+ if not chromosome in coverage_data:
+ continue
+
avg_coverage_contig=library[ "avg_coverage_{}".format(chromosome) ]
library["contig_ploidy_{}".format(chromosome)]=int(round(ploidy*avg_coverage_contig/library["avg_coverage"]))
f.write("{}\t{}\t{}\t{}\n".format(chromosome,avg_coverage_contig/library["avg_coverage"]*ploidy,library["contig_ploidy_{}".format(chromosome)],avg_coverage_contig))
=====================================
tiddit/tiddit_signal.pyx
=====================================
@@ -3,6 +3,7 @@ import sys
import os
import itertools
import time
+from joblib import Parallel, delayed
import tiddit.tiddit_coverage as tiddit_coverage
@@ -26,7 +27,7 @@ def find_SA_query_range(SA):
a.cigar = tuple(SA_cigar)
return(a)
-def SA_analysis(read,min_q,splits,tag):
+def SA_analysis(read,min_q,tag,reference_name):
#print(read.query_alignment_start,read.query_alignment_end,read.is_reverse,read.cigarstring)
suplementary_alignments=read.get_tag(tag).rstrip(";").split(";")
@@ -40,7 +41,6 @@ def SA_analysis(read,min_q,splits,tag):
supplementry_alignment=find_SA_query_range(SA_data)
SA_lengths.append(supplementry_alignment.query_alignment_end-supplementry_alignment.query_alignment_start)
-
longest_aln=0
for i in range(0,len(ok_q)):
if SA_lengths[i] > SA_lengths[longest_aln]:
@@ -48,7 +48,7 @@ def SA_analysis(read,min_q,splits,tag):
#all alignments fail quality treshold
if len(ok_q) == 0:
- return(splits)
+ return()
#only one SA pass mapping quality treshold
elif len(ok_q) == 1:
@@ -61,9 +61,11 @@ def SA_analysis(read,min_q,splits,tag):
SA_pos=int(SA_data[1])
if int(SA_data[4]) < min_q:
- return(splits)
+ return()
+
+ cdef long read_query_alignment_end=read.query_alignment_end
- if (read.query_alignment_start ) < (read.query_length - read.query_alignment_end):
+ if (read.query_alignment_start ) < (read.query_length - read_query_alignment_end):
split_pos=read.reference_end+1
else:
split_pos=read.reference_start+1
@@ -72,21 +74,21 @@ def SA_analysis(read,min_q,splits,tag):
SA_chr=SA_data[0]
- if (supplementry_alignment.query_alignment_start ) < (supplementry_alignment.query_length - read.query_alignment_end):
+ if (supplementry_alignment.query_alignment_start ) < (supplementry_alignment.query_length - read_query_alignment_end):
SA_split_pos=supplementry_alignment.reference_end
else:
SA_split_pos=supplementry_alignment.reference_start
- if SA_chr < read.reference_name:
+ if SA_chr < reference_name:
chrA=SA_chr
- chrB=read.reference_name
+ chrB=reference_name
tmp=split_pos
split_pos=SA_split_pos
SA_split_pos=tmp
else:
- chrA=read.reference_name
+ chrA=reference_name
chrB=SA_chr
if chrA == chrB:
@@ -95,118 +97,150 @@ def SA_analysis(read,min_q,splits,tag):
split_pos=SA_split_pos
SA_split_pos=tmp
- if not chrA in splits:
- splits[chrA]={}
-
- if not chrB in splits[chrA]:
- splits[chrA][chrB]={}
-
- if not read.query_name in splits[chrA][chrB]:
- splits[chrA][chrB][read.query_name]=[]
-
+ split=[]
if "-" == SA_data[2]:
- splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,True]
+ split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos, True]
else:
- splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,False]
-
- return(splits)
+ split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos,False]
+ #splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,False]
-def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id):
+ return(split)
- samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
+def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size):
+ print("Collecting signals on contig: {}".format(chromosome))
+ samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))
bam_header=samfile.header
- cdef int bin_size=50
- cdef str file_type="wig"
- cdef str outfile=prefix+".tiddit_coverage.wig"
+ coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,bin_size,chromosome)
- t_update=0
- t_split=0
- t_disc=0
- t_tot=0
+ clips=[]
+ data=[]
+ splits=[]
- coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,bin_size)
+ clip_dist=100
- cdef dict data={}
- cdef dict splits={}
- cdef dict clips={}
+ cdef long read_position
+ cdef long read_end
+ cdef int mapq
- for chrA in bam_header["SQ"]:
+ for read in samfile.fetch(chromosome,until_eof=True):
- clips[chrA["SN"]]=[]
- for chrB in bam_header["SQ"]:
- if chrA["SN"] <= chrB["SN"]:
- if not chrA["SN"] in data:
- data[chrA["SN"]] = {}
- splits[chrA["SN"]] = {}
- data[chrA["SN"]][chrB["SN"]]={}
- splits[chrA["SN"]][chrB["SN"]]={}
+ if read.is_unmapped or read.is_duplicate:
+ continue
+ read_chromosome=read.reference_name
+ mate_chromosome=read.next_reference_name
+ read_position=read.reference_start
+ read_end=read.reference_end
+ read_mapq=read.mapq
+ read_supplementary=read.is_supplementary
- chromosome_set=set([])
- chromosomes=[]
+ if read_mapq >= min_q:
+ coverage_data=tiddit_coverage.update_coverage(read_position,read_end,bin_size,coverage_data,end_bin_size)
- clip_dist=100
+ if read_supplementary or read.is_secondary:
+ continue
- t_tot=time.time()
- f=open("{}_tiddit/clips_{}.fa".format(prefix,sample_id ),"w")
- for read in samfile.fetch(until_eof=True):
+ if read_mapq > 1:
+ cigar_tuple=read.cigartuples
+ if (cigar_tuple[0][0] == 4 and cigar_tuple[0][1] > 10) and (cigar_tuple[-1][0] == 0 and cigar_tuple[-1][1] > 30) and len(cigar_tuple) < 7:
+ clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
- if read.is_unmapped or read.is_duplicate:
+ elif cigar_tuple[-1][0] == 4 and cigar_tuple[-1][1] > 10 and (cigar_tuple[0][0] == 0 and cigar_tuple[0][1] > 30) and len(cigar_tuple) < 7:
+ clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
+
+ if read_mapq < min_q:
continue
- t=time.time()
- if read.mapq >= min_q:
- coverage_data[read.reference_name]=tiddit_coverage.update_coverage(read,bin_size,coverage_data[read.reference_name],min_q,end_bin_size[read.reference_name])
- t_update+=time.time()-t
+ if read.has_tag("SA"):
+ split=SA_analysis(read,min_q,"SA",read_chromosome)
+ if split:
+ splits.append(split)
- if not read.reference_name in chromosome_set:
- print("Collecting signals on contig: {}".format(read.reference_name))
- chromosome_set.add(read.reference_name)
+ if read.mate_is_unmapped:
+ continue
- if len(chromosomes):
- for clip in clips[ chromosomes[-1] ]:
- f.write("".join( clip ))
- del clips[ chromosomes[-1] ]
- chromosomes.append(read.reference_name)
+ if ( abs(read.isize) > max_ins or mate_chromosome != read_chromosome ):
+ read_query_name=read.query_name
+ if mate_chromosome < read_chromosome:
+ chrA=mate_chromosome
+ chrB=read_chromosome
+ else:
+ chrA=read_chromosome
+ chrB=mate_chromosome
- t=time.time()
+ data.append([chrA,chrB,read_query_name,read_position+1,read_end+1,read.is_reverse,read_chromosome])
- if read.has_tag("SA") and not (read.is_supplementary or read.is_secondary) and read.mapq >= min_q:
- splits=SA_analysis(read,min_q,splits,"SA")
+ f=open("{}_tiddit/clips/{}.fa".format(prefix,chromosome),"w")
+ for clip in clips:
+ f.write("".join(clip))
+ f.close()
- if not (read.is_supplementary or read.is_secondary) and read.mapq > 1:
- if (read.cigartuples[0][0] == 4 and read.cigartuples[0][1] > 10) and (read.cigartuples[-1][0] == 0 and read.cigartuples[-1][1] > 30) and len(read.cigartuples) < 7:
- clips[read.reference_name].append([">{}|{}|{}\n".format(read.query_name,read.reference_name,read.reference_start+1),read.query_sequence+"\n"])
+ return(chromosome,data,splits,coverage_data, "{}_tiddit/clips/{}.fa".format(prefix,chromosome) )
- elif read.cigartuples[-1][0] == 4 and read.cigartuples[-1][1] > 10 and (read.cigartuples[0][0] == 0 and read.cigartuples[0][1] > 30) and len(read.cigartuples) < 7:
- clips[read.reference_name].append([">{}|{}|{}\n".format(read.query_name,read.reference_name,read.reference_start+1),read.query_sequence+"\n"])
+def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig):
- t_split+=time.time()-t
+ samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))
+ bam_header=samfile.header
+ samfile.close()
+ cdef int bin_size=50
+ cdef str file_type="wig"
+ cdef str outfile=prefix+".tiddit_coverage.wig"
- t=time.time()
- if ( abs(read.isize) > max_ins or read.next_reference_name != read.reference_name ) and read.mapq >= min_q and not (read.is_supplementary or read.is_secondary):
- if read.next_reference_name < read.reference_name:
- chrA=read.next_reference_name
- chrB=read.reference_name
- else:
- chrA=read.reference_name
- chrB=read.next_reference_name
+ t_tot=0
- if not read.query_name in data[chrA][chrB]:
- data[chrA][chrB][read.query_name]=[]
+ cdef dict data={}
+ cdef dict splits={}
+ cdef dict coverage_data={}
+ cdef list clip_fasta=[]
+ chromosomes=[]
+
+ for chrA in bam_header["SQ"]:
+ if chrA["LN"] < min_contig:
+ continue
+ chromosomes.append(chrA["SN"])
+ data[chrA["SN"]]={}
+ splits[chrA["SN"]]={}
+ for chrB in bam_header["SQ"]:
+ data[chrA["SN"]][chrB["SN"]]={}
+ splits[chrA["SN"]][chrB["SN"]]={}
- data[chrA][chrB][read.query_name].append([read.reference_start+1,read.reference_end+1,read.is_reverse,read.reference_name])
- t_disc+=time.time()-t
+ t=time.time()
+ res=Parallel(n_jobs=threads)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size) for chromosome in chromosomes )
- f.close()
+ chromosomes=set(chromosomes)
+ for i in range(0,len(res)):
+ coverage_data[ res[i][0] ] = res[i][3]
+
+ if not res[i][0] in chromosomes:
+ continue
+
+ for signal in res[i][1]:
+ if not signal[0] in data:
+ continue
+
+ if not signal[2] in data[ signal[0] ][ signal[1] ]:
+ data[ signal[0] ][signal[1]][signal[2]]=[]
+ data[ signal[0] ][signal[1]][signal[2]].append(signal[3:])
+
+ for signal in res[i][2]:
+ if not signal[0] in splits:
+ continue
+
+ if not signal[2] in splits[ signal[0] ][ signal[1] ]:
+ splits[ signal[0] ][signal[1]][signal[2]]=[]
+ splits[ signal[0] ][signal[1]][signal[2]]+=signal[3:]
+
+ clip_fasta.append(res[i][4])
+
+ t_tot=time.time()-t
- print("total",time.time()-t_tot)
- print("coverage",t_update)
- print("split",t_split)
- print("disc",t_disc)
+ print("total",t_tot)
+ #print("coverage",t_update)
+ #print("split",t_split)
+ #print("disc",t_disc)
#print("writing coverage wig")
#tiddit_coverage.print_coverage(coverage_data,bam_header,bin_size,file_type,outfile)
@@ -217,7 +251,6 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
for chrA in data:
for chrB in data[chrA]:
-
for fragment in data[chrA][chrB]:
if len(data[chrA][chrB][fragment]) < 2:
continue
@@ -234,27 +267,20 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
out=data[chrA][chrB][fragment][1][0:-1]+data[chrA][chrB][fragment][0][0:-1]
f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, out ))) )
-
f.close()
f=open("{}_tiddit/splits_{}.tab".format(prefix,sample_id),"w")
for chrA in splits:
for chrB in splits[chrA]:
-
for fragment in splits[chrA][chrB]:
f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, splits[chrA][chrB][fragment] ))) )
-
f.close()
- f=open("{}_tiddit/clips_{}.fa".format(prefix,sample_id),"a")
-
- for chrA in clips:
- for clip in clips[chrA]:
- f.write("".join( clip ))
+ f=open("{}_tiddit/clips_{}.fa".format(prefix,sample_id),"w")
+ for clips in clip_fasta:
+ for clip in open(clips):
+ f.write(clip)
f.close()
-
return(coverage_data)
- #return(coverage_data,clips)
- #return(coverage_data,clips)
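The restructuring above is easier to see in miniature: main() no longer streams the whole BAM once, but launches one worker per (sufficiently long) contig through joblib, each returning its own signal tables, which the parent merges. A self-contained toy of that pattern with stand-in workers:

from joblib import Parallel, delayed

def worker(contig):
    # Stand-in for tiddit_signal.worker: the real code opens the BAM via
    # the per-run CSI index, fetches only this contig, and fills these
    # tables from the reads.
    data = []      # discordant-pair records: [chrA, chrB, read_name, ...]
    splits = []    # split-read records in the same layout
    coverage = []  # binned coverage for this contig
    return contig, data, splits, coverage

chromosomes = ["chr1", "chr2"]
results = Parallel(n_jobs=2)(delayed(worker)(c) for c in chromosomes)

coverage_data = {}
for contig, data, splits, coverage in results:
    coverage_data[contig] = coverage
    # data/splits get folded into global dicts keyed [chrA][chrB][read_name]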
=====================================
tiddit/tiddit_variant.pyx
=====================================
@@ -1,6 +1,7 @@
import time
import math
import numpy
+from joblib import Parallel, delayed
#from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
from pysam import AlignmentFile, AlignedSegment
@@ -13,6 +14,7 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
cdef int n_discs=0
cdef int n_splits=0
+
cdef int crossing_r=0
cdef int crossing_f=0
@@ -25,21 +27,32 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
if q_end > contig_length:
q_end=contig_length
+ if q_start >= q_end:
+ q_start=q_end-10
+
+ cdef long read_reference_start
+ cdef long read_reference_end
+
+ cdef long r_start
+ cdef long r_end
+
for read in samfile.fetch(chr, q_start, q_end):
if read.is_unmapped:
continue
+
+ read_reference_start=read.reference_start
+
if not read.mate_is_unmapped:
- if read.next_reference_start > end and read.reference_start > end:
+ if read.next_reference_start > end and read_reference_start > end:
continue
else:
- if read.reference_start > end:
+ if read_reference_start > end:
continue
-
if read.is_duplicate:
continue
- if not (read.reference_start > end):
+ if not (read_reference_start > end):
n_reads+=1
if read.mapq < min_q:
low_q+=1
@@ -47,27 +60,31 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
if read.mapq < min_q:
continue
- r_start=read.reference_start
- r_end=read.reference_end
+ read_reference_end=read.reference_end
+ read_reference_name=read.reference_name
+ read_next_reference_name=read.next_reference_name
- if r_start < bp-10 and r_end > bp:
+ r_start=read_reference_start
+ r_end=read_reference_end
+
+ if read_reference_start < bp-10 and r_end > bp:
crossing_r+=1
mate_bp_read= (read.next_reference_start < bp and r_end > bp)
- discordant= ( abs(read.isize) > max_ins or read.next_reference_name != read.reference_name )
+ discordant= ( abs(read.isize) > max_ins or read_next_reference_name != read_reference_name )
if mate_bp_read and not discordant:
crossing_f+=1
- if read.reference_end < start:
+ if read_reference_end < start:
continue
- elif read.reference_start > end:
+ elif read_reference_start > end:
continue
- if read.reference_start < start:
+ if read_reference_start < start:
r_start=start
- if read.reference_end > end:
+ if read_reference_end > end:
r_end=end
bases+=r_end-r_start+1
@@ -75,7 +92,7 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
if read.has_tag("SA"):
n_splits+=1
- if (abs(read.isize) > max_ins) or (read.next_reference_name != read.reference_name) :
+ if discordant:
n_discs+=1
coverage= bases/(end-start+1)
@@ -168,318 +185,321 @@ def sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_s
return(filt)
-def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len):
-
- variants={}
- #cdef AlignmentFile samfile = AlignmentFile(bam_file_name, "r")
- samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref)
-
- contig_seqs={}
- new_seq=False
- if not args.skip_assembly:
- for line in open("{}_tiddit/clips.fa.assembly.clean.mag".format(args.o)):
-
- if not new_seq and line[0] == "@" and "\t" in line:
- name=line.split("\t")[0][1:]
- new_seq=True
+def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs):
- elif new_seq:
- contig_seqs[name]=line.strip("\n")
- new_seq=False
-
- for chrA in sv_clusters:
- variants[chrA]=[]
- for chrB in sv_clusters[chrA]:
- variants[chrB]=[]
+ samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
+ variants=[]
var_n=0
- for chrA in sv_clusters:
- for chrB in sv_clusters[chrA]:
+ for chrB in sv_clusters[chrA]:
+ for cluster in sv_clusters[chrA][chrB]:
+ n_discordants=sv_clusters[chrA][chrB][cluster]["N_discordants"]
+ n_splits=sv_clusters[chrA][chrB][cluster]["N_splits"]
+ n_contigs=sv_clusters[chrA][chrB][cluster]["N_contigs"]
- for cluster in sv_clusters[chrA][chrB]:
- n_discordants=sv_clusters[chrA][chrB][cluster]["N_discordants"]
- n_splits=sv_clusters[chrA][chrB][cluster]["N_splits"]
- n_contigs=sv_clusters[chrA][chrB][cluster]["N_contigs"]
-
- if (n_discordants < args.p and n_splits < args.r) and not n_contigs:
+ if (n_discordants < args.p and n_splits < args.r) and not n_contigs:
continue
- posA=sv_clusters[chrA][chrB][cluster]["posA"]
- posB=sv_clusters[chrA][chrB][cluster]["posB"]
-
- if chrA == chrB and posA > posB:
- posT=posA
- posA=posB
- posB=posT
-
- if chrA == chrB and abs(posA-posB) < args.z:
- continue
+ posA=sv_clusters[chrA][chrB][cluster]["posA"]
+ posB=sv_clusters[chrA][chrB][cluster]["posB"]
- s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startA"]/50.0))
- e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endA"]/50.0))+1
- avg_a=numpy.average(coverage_data[chrA][s:e])
+ if chrA == chrB and posA > posB:
+ posT=posA
+ posA=posB
+ posB=posT
- if avg_a > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
- continue
- elif (args.max_coverage*n_discordants/avg_a < args.p_ratio/2 and args.max_coverage*n_splits/avg_a < args.r_ratio/2) and not n_contigs:
- continue
+ if chrA == chrB and abs(posA-posB) < args.z:
+ continue
- avg_b=numpy.average(coverage_data[chrA][s:e])
- if avg_b == 0:
- continue
- elif avg_b > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
- continue
- elif (args.max_coverage*n_discordants/avg_b < args.p_ratio/2 and args.max_coverage*n_splits/avg_b < args.r_ratio/2) and not n_contigs:
- continue
+ s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startA"]/50.0))
+ e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endA"]/50.0))+1
+ avg_a=numpy.average(coverage_data[chrA][s:e])
+ if avg_a > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
+ continue
+ elif (args.max_coverage*n_discordants/avg_a < args.p_ratio/2 and args.max_coverage*n_splits/avg_a < args.r_ratio/2) and not n_contigs:
+ continue
- var_n+=1
+ avg_b=numpy.average(coverage_data[chrA][s:e])
+ if avg_b == 0:
+ continue
+ elif avg_b > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
+ continue
+ elif (args.max_coverage*n_discordants/avg_b < args.p_ratio/2 and args.max_coverage*n_splits/avg_b < args.r_ratio/2) and not n_contigs:
+ continue
- sample_data={}
- for sample in samples:
- coverageA,frac_low_qA,n_discsA,n_splitsA,crossing_f_A,crossing_r_A=get_region(samfile,chrA,sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"],posA,min_mapq,max_ins_len,contig_number)
- coverageB,frac_low_qB,n_discsB,n_splitsB,crossing_f_B,crossing_r_B=get_region(samfile,chrB,sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"],posB,min_mapq,max_ins_len,contig_number)
+ var_n+=1
+ sample_data={}
+ for sample in samples:
- sample_data[sample]={}
- sample_data[sample]={"covA":coverageA,"QA":frac_low_qA,"discA":n_discsA,"splitA":n_splitsA,"refRA":crossing_r_A,"refFA":crossing_f_A}
- sample_data[sample].update({"covB":coverageB,"QB":frac_low_qA,"discB":n_discsB,"splitB":n_splitsB,"refRB":crossing_r_B,"refFB":crossing_f_B})
+ coverageA,frac_low_qA,n_discsA,n_splitsA,crossing_f_A,crossing_r_A=get_region(samfile,chrA,sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"],posA,min_mapq,max_ins_len,contig_number)
+ coverageB,frac_low_qB,n_discsB,n_splitsB,crossing_f_B,crossing_r_B=get_region(samfile,chrB,sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"],posB,min_mapq,max_ins_len,contig_number)
- if chrA != chrB:
- sample_data[sample]["covM"]=0
- elif abs(posB - posA) < 1000:
- if posA < posB:
- coverageM,_,_,_,_,_=get_region(samfile,chrA,posA,posB,posA,min_mapq,max_ins_len,contig_number)
- else:
- coverageM,_,_,_,_,_=get_region(samfile,chrA,posB,posA,posB,min_mapq,max_ins_len,contig_number)
+ sample_data[sample]={}
+ sample_data[sample]={"covA":coverageA,"QA":frac_low_qA,"discA":n_discsA,"splitA":n_splitsA,"refRA":crossing_r_A,"refFA":crossing_f_A}
+ sample_data[sample].update({"covB":coverageB,"QB":frac_low_qA,"discB":n_discsB,"splitB":n_splitsB,"refRB":crossing_r_B,"refFB":crossing_f_B})
- sample_data[sample]["covM"]=coverageM
+ if chrA != chrB:
+ sample_data[sample]["covM"]=0
+ elif abs(posB - posA) < 1000:
+ if posA < posB:
+ coverageM,_,_,_,_,_=get_region(samfile,chrA,posA,posB,posA,min_mapq,max_ins_len,contig_number)
else:
- s=int(math.floor(posA/50.0))
- e=int(math.floor(posB/50.0))+1
- sample_data[sample]["covM"]=numpy.average(coverage_data[chrA][s:e] )
-
- inverted=0
- non_inverted=0
- for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"]) ):
- if sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_discordants"][i]:
- inverted+=1
- else:
- non_inverted+=1
+ coverageM,_,_,_,_,_=get_region(samfile,chrA,posB,posA,posB,min_mapq,max_ins_len,contig_number)
- for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"]) ):
- if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_splits"][i]:
- inverted+=1
- else:
- non_inverted+=1
+ sample_data[sample]["covM"]=coverageM
+ else:
+ s=int(math.floor(posA/50.0))
+ e=int(math.floor(posB/50.0))+1
+ sample_data[sample]["covM"]=numpy.average(coverage_data[chrA][s:e] )
+
+ inverted=0
+ non_inverted=0
+ for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"]) ):
+ if sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_discordants"][i]:
+ inverted+=1
+ else:
+ non_inverted+=1
- for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"]) ):
- if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_contigs"][i]:
- inverted+=1
- else:
- non_inverted+=1
+ for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"]) ):
+ if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_splits"][i]:
+ inverted+=1
+ else:
+ non_inverted+=1
- svtype,cn=find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,library)
+ for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"]) ):
+ if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_contigs"][i]:
+ inverted+=1
+ else:
+ non_inverted+=1
- filt=sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_splits,library,sample_data[sample]["discA"],sample_data[sample]["discB"],sample_data[sample]["splitA"],sample_data[sample]["splitB"],n_contigs)
- format_col="GT:CN:COV:DR:SR:LQ:RR:RD"
+ svtype,cn=find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,library)
- #configure filters for CNV based on Read depth
- for sample in samples:
- if "DEL" in svtype:
- #homozygout del based on coverage
- if cn == 0:
- filt="PASS"
+ filt=sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_splits,library,sample_data[sample]["discA"],sample_data[sample]["discB"],sample_data[sample]["splitA"],sample_data[sample]["splitB"],n_contigs)
+ format_col="GT:CN:COV:DV:RV:LQ:RR:RD"
- covA=sample_data[sample]["covA"]
- covM=sample_data[sample]["covM"]
- covB=sample_data[sample]["covB"]
+ #configure filters for CNV based on Read depth
+ for sample in samples:
+ if "DEL" in svtype:
+ #homozygout del based on coverage
+ if cn == 0:
+ filt="PASS"
- #normal coverage on the flanking regions, abnormal inbetween
- if covA > covM*(cn+0.9) and covB > covM*(cn+0.9):
- filt="PASS"
+ covA=sample_data[sample]["covA"]
+ covM=sample_data[sample]["covM"]
+ covB=sample_data[sample]["covB"]
- #too few reads, but clear RD signal
- elif "DUP" in svtype and filt == "BelowExpectedLinks":
+ #normal coverage on the flanking regions, abnormal inbetween
+ if covA > covM*(cn+0.9) and covB > covM*(cn+0.9):
filt="PASS"
+ #too few reads, but clear RD signal
+ elif "DUP" in svtype and filt == "BelowExpectedLinks":
+ filt="PASS"
- if svtype != "BND":
- info=["SVTYPE={}".format(svtype),"SVLEN={}".format(posB-posA),"END={}".format(posB)]
- alt="<{}>".format(svtype)
- info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
- info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
- info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
- info+=["LFB={},{}".format(sample_data[sample]["discB"],sample_data[sample]["splitB"])]
- info+=["LTE={},{}".format(n_discordants,n_splits)]
+ if svtype != "BND":
+ info=["SVTYPE={}".format(svtype),"SVLEN={}".format(posB-posA),"END={}".format(posB)]
+ alt="<{}>".format(svtype)
- if n_contigs:
- for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
- if "_" in c:
- c=c.split("_")[0]
- ctgs=[ contig_seqs[c] ]
- info+=["CTG={}".format("|".join(ctgs) )]
+ info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
+ info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
+ info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
+ info+=["LFB={},{}".format(sample_data[sample]["discB"],sample_data[sample]["splitB"])]
+ info+=["LTE={},{}".format(n_discordants,n_splits)]
- else:
- info+=["CTG=."]
+ if n_contigs:
+ for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
+ if "_" in c:
+ c=c.split("_")[0]
+ ctgs=[ contig_seqs[c] ]
+ info+=["CTG={}".format("|".join(ctgs) )]
+ else:
+ info+=["CTG=."]
- info=";".join(info)
- variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt,".",filt,info,format_col]
- for sample in samples:
- GT="./."
- if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
- GT="0/1"
- if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
- GT="1/1"
- if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
- GT="1/1"
- if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
- if sample_data[sample]["covB"]:
- if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
- GT="1/1"
- else:
- GT="0/1"
+ info=";".join(info)
+ variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt,".",filt,info,format_col]
+ for sample in samples:
+ GT="./."
+
+ if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
+ GT="0/1"
+ if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
+ GT="1/1"
+ if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
+ GT="1/1"
+ if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
+ if sample_data[sample]["covB"]:
+ if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
+ GT="1/1"
else:
- GT="1/1"
+ GT="0/1"
+ else:
+ GT="1/1"
- if sample_data[sample]["covA"]:
- if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
- GT="1/1"
- else:
- GT="0/1"
- else:
+ if sample_data[sample]["covA"]:
+ if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
GT="1/1"
-
-
- if "DEL" in alt:
- if cn == 0:
- GT = "1/1"
- else:
- GT= "0/1"
- elif "DUP" in alt:
- if cn >= 2*library["contig_ploidy_{}".format(chrA)]:
- GT= "1/1"
else:
GT="0/1"
+ else:
+ GT="1/1"
+
+
+ if "DEL" in alt:
+ if cn == 0:
+ GT = "1/1"
+ else:
+ GT= "0/1"
+ elif "DUP" in alt:
+ if cn >= 2*library["contig_ploidy_{}".format(chrA)]:
+ GT= "1/1"
+ else:
+ GT="0/1"
- variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
- variants[chrA].append([posA,variant])
+ variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
+ variants.append([chrA,posA,variant])
+ else:
+ info=["SVTYPE=BND".format(svtype)]
+ inverted=False
+ before=True
+
+ if posA == sv_clusters[chrA][chrB][cluster]["endA"]:
+ before=False
+
+ if inverted > non_inverted:
+ inverted=True
+
+ if not inverted and not before:
+ alt_str_a="N[{}:{}[".format(chrB,posB)
+ alt_str_b="]{}:{}]N".format(chrA,posA)
+ elif not inverted and before:
+ alt_str_a="]{}:{}]N".format(chrB,posB)
+ alt_str_b="N[{}:{}[".format(chrA,posA)
+ elif inverted and not before:
+ alt_str_a="N]{}:{}]".format(chrB,posB)
+ alt_str_b="[{}:{}[N".format(chrA,posA)
else:
- info=["SVTYPE=BND".format(svtype)]
- inverted=False
- before=True
-
- if posA == sv_clusters[chrA][chrB][cluster]["endA"]:
- before=False
-
- if inverted > non_inverted:
- inverted=True
-
- if not inverted and not before:
- alt_str_a="N[{}:{}[".format(chrB,posB)
- alt_str_b="]{}:{}]N".format(chrA,posA)
- elif not inverted and before:
- alt_str_a="]{}:{}]N".format(chrB,posB)
- alt_str_b="N[{}:{}[".format(chrA,posA)
- elif inverted and not before:
- alt_str_a="N]{}:{}]".format(chrB,posB)
- alt_str_b="[{}:{}[N".format(chrA,posA)
- else:
- alt_str_a="[{}:{}[N".format(chrB,posB)
- alt_str_b="N]{}:{}]".format(chrA,posA)
+ alt_str_a="[{}:{}[N".format(chrB,posB)
+ alt_str_b="N]{}:{}]".format(chrA,posA)
- info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
- info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
- info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
- info+=["LFB={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
- info+=["LTE={},{}".format(n_discordants,n_splits)]
+ info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
+ info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
+ info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
+ info+=["LFB={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
+ info+=["LTE={},{}".format(n_discordants,n_splits)]
- if n_contigs:
- for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
- if "_" in c:
- c=c.split("_")[0]
+ if n_contigs:
+ for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
+ if "_" in c:
+ c=c.split("_")[0]
- ctgs=[ contig_seqs[c] ]
- info+=["CTG={}".format("|".join(ctgs) )]
+ ctgs=[ contig_seqs[c] ]
+ info+=["CTG={}".format("|".join(ctgs) )]
- else:
- info+=["CTG=."]
+ else:
+ info+=["CTG=."]
- info=";".join(info)
- variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt_str_a,".",filt,info,format_col]
- for sample in samples:
- GT="./."
- if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
- GT="0/1"
- if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
- GT="1/1"
- if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
- GT="1/1"
- if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
- if sample_data[sample]["covB"]:
- if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
- GT="1/1"
- else:
- GT="0/1"
+ info=";".join(info)
+ variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt_str_a,".",filt,info,format_col]
+ for sample in samples:
+ GT="./."
+ if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
+ GT="0/1"
+ if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
+ GT="1/1"
+ if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
+ GT="1/1"
+ if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
+ if sample_data[sample]["covB"]:
+ if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
+ GT="1/1"
else:
- GT="1/1"
-
- if sample_data[sample]["covA"]:
- if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
- GT="1/1"
- else:
- GT="0/1"
-
+ GT="0/1"
+ else:
+ GT="1/1"
+ if sample_data[sample]["covA"]:
+ if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
+ GT="1/1"
else:
- GT="1/1"
+ GT="0/1"
+ else:
+ GT="1/1"
- variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
- variants[chrA].append([posA,variant])
- variant=[chrB,str(posB),"SV_{}_2".format(var_n),"N",alt_str_b,".",filt,info,format_col]
- for sample in samples:
- GT="./."
- if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
- GT="0/1"
- if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
- GT="1/1"
- if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
- GT="1/1"
- if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
- if sample_data[sample]["covB"]:
- if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
- GT="1/1"
- else:
- GT="0/1"
- else:
- GT="1/1"
+ variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
+ variants.append([chrA,posA,variant])
- if sample_data[sample]["covA"]:
- if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
- GT="1/1"
- else:
- GT="0/1"
+ variant=[chrB,str(posB),"SV_{}_2".format(var_n),"N",alt_str_b,".",filt,info,format_col]
+ for sample in samples:
+ GT="./."
+ if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
+ GT="0/1"
+ if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
+ GT="1/1"
+ if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
+ GT="1/1"
+ if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
+ if sample_data[sample]["covB"]:
+ if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
+ GT="1/1"
+ else:
+ GT="0/1"
+ else:
+ GT="1/1"
+ if sample_data[sample]["covA"]:
+ if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
+ GT="1/1"
else:
- GT="1/1"
+ GT="0/1"
+ else:
+ GT="1/1"
- variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
- variants[chrB].append([posB,variant])
+ variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
+ variants.append([chrB,posB,variant])
samfile.close()
return(variants)
+
+def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len):
+ contig_seqs={}
+ new_seq=False
+ if not args.skip_assembly:
+ for line in open("{}_tiddit/clips.fa.assembly.clean.mag".format(args.o)):
+
+ if not new_seq and line[0] == "@" and "\t" in line:
+ name=line.split("\t")[0][1:]
+ new_seq=True
+
+ elif new_seq:
+ contig_seqs[name]=line.strip("\n")
+ new_seq=False
+ variants={}
+ for chrA in sv_clusters:
+ variants[chrA]=[]
+ for chrB in sv_clusters[chrA]:
+ variants[chrB]=[]
+
+ variants_list=Parallel(n_jobs=args.threads)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
+
+ for v in variants_list:
+ for variant in v:
+ variants[ variant[0] ].append( [ variant[1],variant[2] ] )
+
+ return(variants)
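Variant calling follows the same pattern: define_variant runs once per chrA under joblib, emits flat [chromosome, position, vcf_row] records, and main() regroups them per chromosome. A toy sketch (the real sv_clusters nests chrB and cluster ids under chrA; plain positions stand in here):

from joblib import Parallel, delayed

def define_variant(chrA, sv_clusters):
    # Stand-in for the real worker, which re-opens the BAM through the
    # CSI index and genotypes every cluster on chrA.
    return [[chrA, pos, ["placeholder_vcf_row"]]
            for pos in sorted(sv_clusters[chrA])]

sv_clusters = {"chr1": {100: {}, 500: {}}, "chr2": {42: {}}}
variants = {c: [] for c in sv_clusters}

for per_chr in Parallel(n_jobs=2)(
        delayed(define_variant)(c, sv_clusters) for c in sv_clusters):
    for chrom, pos, row in per_chr:
        variants[chrom].append([pos, row])  # regroup per chromosome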
View it on GitLab: https://salsa.debian.org/med-team/tiddit/-/commit/91690294192730217026f2c1710e063d6b9e0b07
You're receiving this email because of your account on salsa.debian.org.