[med-svn] [Git][med-team/sniffles][upstream] New upstream version 2.2

Lance Lin (@linqigang) gitlab at salsa.debian.org
Fri Jul 14 16:15:53 BST 2023


Lance Lin pushed to branch upstream at Debian Med / sniffles


Commits:
916a7448 by Lance Lin at 2023-07-14T21:54:54+07:00
New upstream version 2.2
- - - - -


13 changed files:

- .gitignore
- README.md
- setup.cfg
- src/sniffles/cluster.py
- src/sniffles/config.py
- src/sniffles/consensus.py
- src/sniffles/leadprov.py
- src/sniffles/parallel.py
- src/sniffles/postprocessing.py
- src/sniffles/snf.py
- src/sniffles/sniffles
- src/sniffles/sv.py
- src/sniffles/vcf.py


Changes:

=====================================
.gitignore
=====================================
@@ -1,3 +1,5 @@
 .DS_Store
 src/sniffles.egg-info
 dist
+__pycache__
+*.pyc


=====================================
README.md
=====================================
@@ -21,7 +21,7 @@ or
 
 If you previously installed Sniffles1 using conda and want to upgrade to Sniffles2, you can use:
 
-`conda update sniffles=2.0`
+`conda update sniffles=2.2`
 
 ## Requirements
 * Python >= 3.7
@@ -36,7 +36,7 @@ Please cite our paper at:
 
 https://www.nature.com/articles/s41592-018-0001-7
 
-A new preprint for the new methods and improvements introduced with Sniffles2 is here: https://www.biorxiv.org/content/10.1101/2022.04.04.487055v1 
+A new preprint for the new methods and improvements introduced with Sniffles2 is here: https://www.biorxiv.org/content/10.1101/2022.04.04.487055v1
 
 ## Use-Cases / Modes
 
@@ -55,10 +55,10 @@ Multi-sample SV calling using Sniffles2 population mode works in two steps:
 Alternatively, for step 2. you can supply a .tsv file, containing a list of .snf files, and custom sample ids in an optional second column (one sample per line), .e.g.:
 2. Combined calling using a .tsv as sample list: `sniffles --input snf_files_list.tsv --vcf multisample.vcf`
 
-### C. Non-Germline SV Calling (Somatic)
-To call non-germline SVs (i.e. somatic/mosaic) SVs, the *--non-germline* option should be added, i.e.:
+### C. Mosaic SV Calling (Non-germline or somatic SVs)
+To call mosaic SVs, the *--mosaic* option should be added, i.e.:
 
-`sniffles --input mapped_input.bam --vcf output.vcf --non-germline`
+`sniffles --input mapped_input.bam --vcf output.vcf --mosaic`
 
 ### D. Genotyping a known set of SVs (Force Calling)
 Example command, to determine the genotype of each SV in *input_known_svs.vcf* for *sample.bam* and write the re-genotyped SVs to *output_genotypes.vcf*:


=====================================
setup.cfg
=====================================
@@ -1,6 +1,6 @@
 [metadata]
 name = sniffles
-version = 2.0.7
+version = 2.2
 author = Moritz Smolka
 author_email = moritz.g.smolka at gmail.com
 description = A fast structural variation caller for long-read sequencing data


=====================================
src/sniffles/cluster.py
=====================================
@@ -36,6 +36,7 @@ class Cluster:
             self.stdev_start=0
             return
 
+        # REVIEW: might need fix based on issue #407
         step=int(len(self.leads)/n)
         if n>1:
             self.mean_svlen=sum(self.leads[i].svlen for i in range(0,len(self.leads),step))/float(n)
@@ -247,6 +248,12 @@ def resolve(svtype,leadtab_provider,config,tr):
             i=max(0,i-2)
         i+=1
 
+    if config.dev_trace_read:
+        for c in clusters:
+            for ld in c.leads:
+                if ld.read_qname==config.dev_trace_read:
+                    print(f"[DEV_TRACE_READ [2/4] [cluster.resolve] Read lead {ld} is in cluster {c.id}, containing a total of {len(c.leads)} leads")
+
     if config.dev_dump_clusters:
         filename=f"{config.input}.clusters.{svtype}.{leadtab_provider.contig}.{leadtab_provider.start}.{leadtab_provider.end}.bed"
         print(f"Dumping clusters to {filename}")
@@ -265,7 +272,7 @@ def resolve(svtype,leadtab_provider,config,tr):
             if config.dev_no_resplit:
                 yield cluster
             else:
-                for new_cluster in resplit_bnd(cluster,merge_threshold=config.bnd_cluster_resplit):
+                for new_cluster in resplit_bnd(cluster,merge_threshold=config.cluster_merge_bnd):
                     yield new_cluster
         else:
             if svtype=="INS" or svtype=="DEL":


=====================================
src/sniffles/config.py
=====================================
@@ -16,7 +16,7 @@ import argparse
 from sniffles import util
 
 VERSION="Sniffles2"
-BUILD="2.0.7"
+BUILD="2.2"
 SNF_VERSION="S2_rc4"
 
 class ArgFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
@@ -25,9 +25,9 @@ class ArgFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescripti
 def tobool(v):
     if v==True or v==False:
         return v
-    elif v.lower()=="true" or v=="1":
+    elif v.strip().lower()=="true" or v.strip()=="1":
         return True
-    elif v.lower()=="false" or v=="0":
+    elif v.strip().lower()=="false" or v.strip()=="0":
         return False
     else:
         raise argparse.ArgumentTypeError("Boolean value (True | False) required for argument")
@@ -46,8 +46,8 @@ def from_cmdline():
     ... OR, simultaneously producing a single-sample VCF and SNF file for later multi-sample calling:
       sniffles --input sample1.bam --vcf sample1.vcf.gz --snf sample1.snf
 
-    ... OR, with additional options to specify tandem repeat annotations (for improved call accuracy), reference (for DEL sequences) and non-germline mode for detecting rare SVs:
-      sniffles --input sample1.bam --vcf sample1.vcf.gz --tandem-repeats tandem_repeats.bed --reference genome.fa --non-germline
+    ... OR, with additional options to specify tandem repeat annotations (for improved call accuracy), reference (for DEL sequences) and mosaic mode for detecting rare SVs:
+      sniffles --input sample1.bam --vcf sample1.vcf.gz --tandem-repeats tandem_repeats.bed --reference genome.fa --mosaic
 
  Usage example B - Multi-sample calling:
     Step 1. Create .snf for each sample: sniffles --input sample1.bam --snf sample1.snf
@@ -59,7 +59,7 @@ def from_cmdline():
  Usage example C - Determine genotypes for a set of known SVs (force calling):
     sniffles --input sample.bam --genotype-vcf input_known_svs.vcf --vcf output_genotypes.vcf
     """
-    usage="sniffles --input SORTED_INPUT.bam [--vcf OUTPUT.vcf] [--snf MERGEABLE_OUTPUT.snf] [--threads 4] [--non-germline]\n\n" + header + "\n\n" + example + "\n\n Use --help for full parameter/usage information\n \n"
+    usage="sniffles --input SORTED_INPUT.bam [--vcf OUTPUT.vcf] [--snf MERGEABLE_OUTPUT.snf] [--threads 4] [--mosaic]\n\n" + header + "\n\n" + example + "\n\n Use --help for full parameter/usage information\n \n"
     parser = argparse.ArgumentParser(description="", epilog=example, formatter_class=lambda prog: ArgFormatter(prog,max_help_position=100,width=150), usage=usage)
     parser.add_argument("--version", action="version", version=f"Sniffles2, Version {BUILD}")
 
@@ -69,26 +69,27 @@ def from_cmdline():
     main_args.add_argument("--snf", metavar="OUT.snf", type=str, help="Sniffles2 file (.snf) output filename to store candidates for later multi-sample calling", required=False)
     main_args.add_argument("--reference", metavar="reference.fasta", type=str, help="(Optional) Reference sequence the reads were aligned against. To enable output of deletion SV sequences, this parameter must be set.", default=None)
     main_args.add_argument("--tandem-repeats", metavar="IN.bed", type=str, help="(Optional) Input .bed file containing tandem repeat annotations for the reference genome.", default=None)
-    main_args.add_argument("--non-germline", help="Call non-germline SVs (rare, somatic or mosaic SVs)", default=False, action="store_true")
     main_args.add_argument("--phase", help="Determine phase for SV calls (requires the input alignments to be phased)", default=False, action="store_true")
     main_args.add_argument("-t","--threads", metavar="N", type=int, help="Number of parallel threads to use (speed-up for multi-core CPUs)", default=4)
 
     filter_args = parser.add_argument_group("SV Filtering parameters")
     filter_args.add_argument("--minsupport", metavar="auto", type=str, help="Minimum number of supporting reads for a SV to be reported (default: automatically choose based on coverage)", default="auto")
-    filter_args.add_argument("--minsupport-auto-mult", metavar="0.1/0.025", type=float, help="Coverage based minimum support multiplier for germline/non-germline modes (only for auto minsupport) ", default=None)
-    filter_args.add_argument("--minsvlen", metavar="N", type=int, help="Minimum SV length (in bp)", default=35)
+    filter_args.add_argument("--minsupport-auto-mult", metavar="0.1/0.025", type=float, help="Coverage based minimum support multiplier for germline mode (only for auto minsupport) ", default=None)
+    filter_args.add_argument("--minsvlen", metavar="N", type=int, help="Minimum SV length (in bp)", default=50)
     filter_args.add_argument("--minsvlen-screen-ratio", metavar="N", type=float, help="Minimum length for SV candidates (as fraction of --minsvlen)", default=0.9)
-    filter_args.add_argument("--mapq", metavar="N", type=int, help="Alignments with mapping quality lower than this value will be ignored", default=25)
+    filter_args.add_argument("--mapq", metavar="N", type=int, help="Alignments with mapping quality lower than this value will be ignored", default=20)
     filter_args.add_argument("--no-qc", "--qc-output-all", help="Output all SV candidates, disregarding quality control steps.", default=False, action="store_true")
     filter_args.add_argument("--qc-stdev", help="Apply filtering based on SV start position and length standard deviation", metavar="True", type=tobool, default=True)
     filter_args.add_argument("--qc-stdev-abs-max", help="Maximum standard deviation for SV length and size (in bp)", metavar="N", type=int, default=500)
     filter_args.add_argument("--qc-strand", help="Apply filtering based on strand support of SV calls", metavar="False", type=tobool, default=False)
     filter_args.add_argument("--qc-coverage", help="Minimum surrounding region coverage of SV calls", metavar="N", type=int, default=1)
     filter_args.add_argument("--long-ins-length", help="Insertion SVs longer than this value are considered as hard to detect based on the aligner and read length and subjected to more sensitive filtering.", metavar="2500", type=int, default=2500)
-    filter_args.add_argument("--long-del-length", help="Deletion SVs longer than this value are subjected to central coverage drop-based filtering (Not applicable for --non-germline)", metavar="50000", type=int, default=50000)
-    filter_args.add_argument("--long-del-coverage", help="Long deletions with central coverage (in relation to upstream/downstream coverage) higher than this value will be filtered (Not applicable for --non-germline)", metavar="0.66", type=float, default=0.66)
-    filter_args.add_argument("--long-dup-length", help="Duplication SVs longer than this value are subjected to central coverage increase-based filtering (Not applicable for --non-germline)", metavar="50000", type=int, default=50000)
-    filter_args.add_argument("--long-dup-coverage", help="Long duplications with central coverage (in relation to upstream/downstream coverage) lower than this value will be filtered (Not applicable for --non-germline)", metavar="1.33", type=float, default=1.33)
+    filter_args.add_argument("--long-del-length", help="Deletion SVs longer than this value are subjected to central coverage drop-based filtering (Not applicable for --mosaic)", metavar="50000", type=int, default=50000)
+    filter_args.add_argument("--long-del-coverage", help="Long deletions with central coverage (in relation to upstream/downstream coverage) higher than this value will be filtered (Not applicable for --mosaic)", metavar="0.66", type=float, default=0.66)
+    filter_args.add_argument("--long-dup-length", help="Duplication SVs longer than this value are subjected to central coverage increase-based filtering (Not applicable for --mosaic)", metavar="50000", type=int, default=50000)
+    filter_args.add_argument("--qc-bnd-filter-strand", help="Filter breakends that do not have support for both strands", type=tobool, default=True)
+    filter_args.add_argument("--bnd-min-split-length", help="Minimum length of read splits to be considered for breakends", type=int, default=1000)
+    filter_args.add_argument("--long-dup-coverage", help="Long duplications with central coverage (in relation to upstream/downstream coverage) lower than this value will be filtered (Not applicable for --mosaic)", metavar="1.33", type=float, default=1.33)
     filter_args.add_argument("--max-splits-kb", metavar="N", type=float, help="Additional number of splits per kilobase read sequence allowed before reads are ignored", default=0.1)
     filter_args.add_argument("--max-splits-base", metavar="N", type=int, help="Base number of splits allowed before reads are ignored (in addition to --max-splits-kb)", default=3)
     filter_args.add_argument("--min-alignment-length", metavar="N", type=int, help="Reads with alignments shorter than this length (in bp) will be ignored", default=1000)
@@ -103,7 +104,7 @@ def from_cmdline():
     cluster_args.add_argument("--cluster-repeat-h-max", metavar="N", type=float, help="Max. merging distance based on SV length criterion for tandem repeat cluster merging", default=1000)
     cluster_args.add_argument("--cluster-merge-pos", metavar="N", type=int, help="Max. merging distance for insertions and deletions on the same read and cluster in non-repeat regions", default=150)
     cluster_args.add_argument("--cluster-merge-len", metavar="F", type=float, help="Max. size difference for merging SVs as fraction of SV length", default=0.33)
-    cluster_args.add_argument("--cluster-merge-bnd", metavar="N", type=int, help="Max. merging distance for breakend SV candidates.", default=1500)
+    cluster_args.add_argument("--cluster-merge-bnd", metavar="N", type=int, help="Max. merging distance for breakend SV candidates.", default=1000)
 
     genotype_args = parser.add_argument_group("SV Genotyping parameters")
     genotype_args.add_argument("--genotype-ploidy", metavar="N", type=int, help="Sample ploidy (currently fixed at value 2)", default=2)
@@ -119,7 +120,10 @@ def from_cmdline():
     multi_args.add_argument("--combine-match", metavar="N", type=int, help="Multiplier for maximum deviation of multiple SV's start/end position for them to be combined across samples. Given by max_dev=M*sqrt(min(SV_length_a,SV_length_b)), where M is this parameter.", default=250)
     multi_args.add_argument("--combine-match-max", metavar="N", type=int, help="Upper limit for the maximum deviation computed for --combine-match, in bp.", default=1000)
     multi_args.add_argument("--combine-separate-intra", help="Disable combination of SVs within the same sample", default=False, action="store_true")
-    multi_args.add_argument("--combine-output-filtered", help="Include low-confidence / putative non-germline SVs in multi-calling", default=False, action="store_true")
+    multi_args.add_argument("--combine-output-filtered", help="Include low-confidence / mosaic SVs in multi-calling", default=False, action="store_true")
+    multi_args.add_argument("--combine-pair-relabel", help="Override low-quality genotypes when combining 2 samples (may be used for e.g. tumor-normal comparisons)", default=False, action="store_true")
+    multi_args.add_argument("--combine-pair-relabel-threshold", help="Genotype quality below which a genotype call will be relabeled", default=20, type=int)
+    multi_args.add_argument("--combine-close-handles", help="Close .SNF file handles after each use. May lower performance, but may be required when maximum number of file handles supported by OS is reached when merging many samples.", default=False, action="store_true")
     #multi_args.add_argument("--combine-exhaustive", help="(DEV) Disable performance optimization in multi-calling", default=False, action="store_true")
     #multi_args.add_argument("--combine-relabel-rare", help="(DEV)", default=False, action="store_true")
     #multi_args.add_argument("--combine-with-missing", help="(DEV)", default=False, action="store_true")
@@ -134,6 +138,18 @@ def from_cmdline():
     postprocess_args.add_argument("--symbolic", help="Output all SVs as symbolic, including insertions and deletions, instead of reporting nucleotide sequences.", default=False, action="store_true")
     postprocess_args.add_argument("--allow-overwrite", help="Allow overwriting output files if already existing", default=False, action="store_true")
 
+    mosaic_args = parser.add_argument_group("Mosaic calling mode parameters")
+    mosaic_args.add_argument("--mosaic", help="Set Sniffles run mode to detect rare, somatic and mosaic SVs", default=False, action="store_true")
+    mosaic_args.add_argument("--mosaic-af-max", help="Maximum allele frequency for which SVs are considered mosaic", metavar="F", default=0.3, type=float)
+    mosaic_args.add_argument("--mosaic-af-min", help="Minimum allele frequency for mosaic SVs to be output", metavar="F", default=0.05, type=float)
+    mosaic_args.add_argument("--mosaic-qc-invdup-min-length", help="Minimum SV length for mosaic inversion and duplication SVs", metavar="N", default=500, type=int)
+    mosaic_args.add_argument("--mosaic-qc-nm", default=True, action="store_true", help=argparse.SUPPRESS)
+    mosaic_args.add_argument("--mosaic-qc-nm-mult", metavar="F", type=float, default=1.66, help=argparse.SUPPRESS)
+    mosaic_args.add_argument("--mosaic-qc-coverage-max-change-frac", help="Maximum relative coverage change across SV breakpoints", metavar="F", type=float, default=0.1)
+    mosaic_args.add_argument("--mosaic-qc-strand", help="Apply filtering based on strand support of SV calls", metavar="True", type=tobool, default=True)
+    mosaic_args.add_argument("--mosaic-include-germline", help="Report germline SVs as well in mosaic mode", default=False, action="store_true")
+
+
     developer_args = parser.add_argument_group("Developer parameters")
     developer_args.add_argument("--dev-cache", default=False, action="store_true", help=argparse.SUPPRESS)
     developer_args.add_argument("--dev-cache-dir", metavar="PATH", type=str, default=None, help=argparse.SUPPRESS)
@@ -153,12 +169,17 @@ def from_cmdline():
     developer_args.add_argument("--low-memory", default=False, action="store_true", help=argparse.SUPPRESS)
     developer_args.add_argument("--repeat", default=False, action="store_true", help=argparse.SUPPRESS)
     developer_args.add_argument("--qc-nm", default=False, action="store_true", help=argparse.SUPPRESS)
-    developer_args.add_argument("--qc-nm-max", metavar="F", type=float, default=0.2, help=argparse.SUPPRESS)
+    developer_args.add_argument("--qc-nm-mult", metavar="F", type=float, default=1.66, help=argparse.SUPPRESS)
+    developer_args.add_argument("--qc-coverage-max-change-frac", help="Maximum relative coverage change across SV breakpoints", metavar="F", type=float, default=-1)
     developer_args.add_argument("--coverage-updown-bins", metavar="N", type=int, default=5, help=argparse.SUPPRESS)
     developer_args.add_argument("--coverage-shift-bins", metavar="N", type=int, default=3, help=argparse.SUPPRESS)
     developer_args.add_argument("--coverage-shift-bins-min-aln-length", metavar="N", type=int, default=1000, help=argparse.SUPPRESS)
     developer_args.add_argument("--cluster-binsize-combine-mult", metavar="N", type=int, default=5, help=argparse.SUPPRESS)
     developer_args.add_argument("--cluster-resplit-binsize", metavar="N", type=int, default=20, help=argparse.SUPPRESS)
+    developer_args.add_argument("--dev-trace-read", default=False, metavar="read_id", type=str, help=argparse.SUPPRESS)
+    developer_args.add_argument("--dev-split-max-query-distance-mult", metavar="N", type=int, default=5, help=argparse.SUPPRESS)
+
+
     #developer_args.add_argument("--qc-strand", help="(DEV)", default=False, action="store_true")
 
     config=parser.parse_args()
@@ -198,13 +219,7 @@ def from_cmdline():
     config.minsupport_auto_regional_coverage_weight=0.75
 
     if config.minsupport_auto_mult==None:
-        if config.non_germline:
-            config.minsupport_auto_mult=0.025
-        else:
-            config.minsupport_auto_mult=0.1
-
-    if config.non_germline:
-        config.qc_nm=True
+        config.minsupport_auto_mult=0.1
 
     config.coverage_binsize=config.cluster_binsize
     config.coverage_binsize_combine=config.cluster_binsize*config.cluster_binsize_combine_mult
@@ -225,7 +240,6 @@ def from_cmdline():
 
     #BND
     config.bnd_cluster_length=1000
-    config.bnd_cluster_resplit=0
 
     #Genotyping
     config.genotype_format="GT:GQ:DR:DV"
@@ -237,7 +251,6 @@ def from_cmdline():
 
     #SNF
     config.snf_block_size=10**5
-    config.snf_combine_keep_open=True #Keep file handles open during .snf combining (might be an issue if the number of .snf files to merge is very large)
 
     #Combine
     config.combine_exhaustive=False
@@ -255,4 +268,15 @@ def from_cmdline():
 
     config.workdir=os.getcwd()
 
+    #Mosaic
+    if config.mosaic_include_germline:
+        config.mosaic=True
+
+    config.qc_nm_measure=config.qc_nm
+    if config.mosaic:
+        #config.qc_coverage_max_change_frac=config.mosaic_qc_coverage_max_change_frac
+        config.qc_nm_measure=config.qc_nm_measure or config.mosaic_qc_nm
+        #config.qc_nm_mult=config.mosaic_qc_nm_mult
+        #config.qc_strand=config.mosaic_qc_strand
+
     return config


=====================================
src/sniffles/consensus.py
=====================================
@@ -354,6 +354,7 @@ def novel_from_reads(best_lead,other_leads,klen,skip,skip_repetitive,debug=False
                     conseq_new.append("-"*len(buffer))
         conseq="".join(conseq_new)
 
+        # FIXME: can lead to ZeroDivisionError
         if span/float(len(best_lead.seq)) > minspan:
             alignments.append(conseq)
 


=====================================
src/sniffles/leadprov.py
=====================================
@@ -162,6 +162,7 @@ def read_iterindels(read_id,read,contig,config,use_clips,read_nm):
 
     pos_read=0
     pos_ref=read.reference_start
+
     for op,oplength in read.cigartuples:
         add_read,add_ref,event=OPLIST[op]
         if event and oplength >= minsvlen:
@@ -212,6 +213,34 @@ def read_iterindels(read_id,read,contig,config,use_clips,read_nm):
         pos_read+=add_read*oplength
         pos_ref+=add_ref*oplength
 
+def get_cigar_indels(read_id,read,contig,config,use_clips,read_nm):
+    minsvlen=config.minsvlen_screen
+    longinslen=config.long_ins_length/2.0
+    seq_cache_maxlen=config.dev_seq_cache_maxlen
+    qname=read.query_name
+    mapq=read.mapping_quality
+    strand="-" if read.is_reverse else "+"
+    CINS=pysam.CINS
+    CDEL=pysam.CDEL
+    CSOFT_CLIP=pysam.CSOFT_CLIP
+
+    INS_SUM=0
+    DEL_SUM=0
+
+    pos_read=0
+    pos_ref=read.reference_start
+    for op,oplength in read.cigartuples:
+        add_read,add_ref,event=OPLIST[op]
+        if event:
+            if op==CINS:
+                INS_SUM+=oplength
+            elif op==CDEL:
+                DEL_SUM+=oplength
+        pos_read+=add_read*oplength
+        pos_ref+=add_ref*oplength
+
+    return INS_SUM,DEL_SUM
+
 def read_itersplits_bnd(read_id,read,contig,config,read_nm):
     assert(read.is_supplementary)
     #SA:refname,pos,strand,CIGAR,MAPQ,NM
@@ -276,7 +305,7 @@ def read_itersplits_bnd(read_id,read,contig,config,read_nm):
                               split_qry_start+readspan,
                               strand,
                               mapq,
-                              nm/float(readspan+1),
+                              read_nm,
                               "SPLIT_SUP",
                               "?"))
 
@@ -307,10 +336,14 @@ def read_itersplits(read_id,read,contig,config,read_nm):
     #SA:refname,pos,strand,CIGAR,MAPQ,NM
     all_leads=[]
     supps=[part.split(",") for part in read.get_tag("SA").split(";") if len(part)>0]
+    trace_read=config.dev_trace_read!=False and config.dev_trace_read==read.query_name
 
     if len(supps) > config.max_splits_base + config.max_splits_kb*(read.query_length/1000.0):
         return
 
+    if trace_read:
+        print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] passed max_splits check")
+
     #QC on: 18Aug21, HG002.ont.chr22; O.K.
     #cigarl=CIGAR_tolist(read.cigarstring)
     #if read.is_reverse:
@@ -376,7 +409,7 @@ def read_itersplits(read_id,read,contig,config,read_nm):
                               split_qry_start+readspan,
                               strand,
                               mapq,
-                              nm/float(readspan+1),
+                              read_nm,
                               "SPLIT_SUP",
                               "?"))
 
@@ -388,8 +421,28 @@ def read_itersplits(read_id,read,contig,config,read_nm):
         #assert(CIGAR_listreadstart_rev(cigarl)==readstart_rev)
         #End QC
 
+    if trace_read:
+        print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] all_leads: {all_leads}")
+
     sv.classify_splits(read,all_leads,config,contig)
 
+    if trace_read:
+        print(f"[DEV_TRACE_READ] [0c/4] [LeadProvider.read_itersplits] [{read.query_name}] classify_splits(all_leads): {all_leads}")
+
+
+    """
+    if config.dev_trace_read != False:
+        print(read.query_name)
+        if read.query_name == config.dev_trace_read:
+            for lead_i, lead in enumerate(all_leads):
+                for svtype, svstart, arg in lead.svtypes_starts_lens:
+                    min_mapq=min(lead.mapq,all_leads[max(0,lead_i-1)].mapq)
+                    keep=True
+                    if not config.dev_keep_lowqual_splits and min_mapq < config.mapq:
+                        keep=False
+                    print(f"[DEV_TRACE_READ] [REPORT_LEAD_SPLIT] Splits identified from read {read.read_id}")
+    """
+
     for lead_i, lead in enumerate(all_leads):
         for svtype, svstart, arg in lead.svtypes_starts_lens:
             min_mapq=min(lead.mapq,all_leads[max(0,lead_i-1)].mapq)
@@ -488,7 +541,6 @@ class LeadProvider:
         for ld in self.iter_region(bam,contig,start,end):
             ld_contig,ld_ref_start=ld.contig,ld.ref_start
 
-            #TODO: Handle leads overlapping region ends (start/end)
             if contig==ld_contig and ld_ref_start >= start and ld_ref_start < end:
                 pos_leadtab=int(ld_ref_start/ld_binsize)*ld_binsize
                 self.record_lead(ld,pos_leadtab)
@@ -507,13 +559,20 @@ class LeadProvider:
         coverage_shift_bins=self.config.coverage_shift_bins
         coverage_shift_min_aln_len=self.config.coverage_shift_bins_min_aln_length
         long_ins_threshold=self.config.long_ins_length*0.5
-        qc_nm=self.config.qc_nm
+        qc_nm=self.config.qc_nm_measure
         phase=self.config.phase
         advanced_tags=qc_nm or phase
         mapq_min=self.config.mapq
         alen_min=self.config.min_alignment_length
+        nm_sum=0
+        nm_count=0
+        trace_read=self.config.dev_trace_read
 
         for read in bam.fetch(contig,start,end,until_eof=False):
+            if trace_read!=False:
+                if trace_read==read.query_name:
+                    print(f"[DEV_TRACE_READ] [0b/4] [LeadProvider.iter_region] [{contig}:{start}-{end}] [{read.query_name}] has been fetched and is entering pre-filtering")
+
             #if self.read_count % 1000000 == 0:
             #    gc.collect()
             if read.reference_start < start or read.reference_start >= end:
@@ -534,22 +593,48 @@ class LeadProvider:
             if advanced_tags:
                 if qc_nm:
                     if read.has_tag("NM"):
-                        nm=read.get_tag("NM")/float(read.query_alignment_length+1)
+                        nm_raw=read.get_tag("NM")
+                        nm_ratio=read.get_tag("NM")/float(read.query_alignment_length+1)
+                        ins_sum,del_sum=get_cigar_indels(curr_read_id,read,contig,self.config,use_clips,read_nm=nm)
+                        nm_adj=(nm_raw-(ins_sum+del_sum))
+                        nm_adj_ratio=nm_adj/float(read.query_alignment_length+1)
+                        nm=nm_adj_ratio
+                        nm_sum+=nm
+                        nm_count+=1
 
                 if phase:
                     curr_read_id=(self.read_id,str(read.get_tag("HP")) if read.has_tag("HP") else "NULL",str(read.get_tag("PS")) if read.has_tag("PS") else "NULL")
 
+            if trace_read!=False:
+                if trace_read==read.query_name:
+                    print(f"[DEV_TRACE_READ] [0b/4] [LeadProvider.iter_region] [{contig}:{start}-{end}] [{read.query_name}] passed pre-filtering (whole-read), begin to extract leads")
+
             #Extract small indels
             for lead in read_iterindels(curr_read_id,read,contig,self.config,use_clips,read_nm=nm):
+                if trace_read!=False:
+                    if trace_read==read.query_name:
+                        print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_iterindels] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
                 yield lead
 
             #Extract read splits
             if has_sa:
                 if read.is_supplementary:
+                    if trace_read!=False:
+                        if trace_read==read.query_name:
+                            print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits_bnd] [{contig}:{start}-{end}] [{read.query_name}] is entering read_itersplits_bnd")
                     for lead in read_itersplits_bnd(curr_read_id,read,contig,self.config,read_nm=nm):
+                        if trace_read!=False:
+                            if trace_read==read.query_name:
+                                print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits_bnd] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
                         yield lead
                 else:
+                    if trace_read!=False:
+                        if trace_read==read.query_name:
+                            print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits] [{contig}:{start}-{end}] [{read.query_name}] is entering read_itersplits")
                     for lead in read_itersplits(curr_read_id,read,contig,self.config,read_nm=nm):
+                        if trace_read!=False:
+                            if trace_read==read.query_name:
+                                print(f"[DEV_TRACE_READ] [1/4] [leadprov.read_itersplits] [{contig}:{start}-{end}] [{read.query_name}] new lead: {lead}")
                         yield lead
 
             #Record in coverage table
@@ -570,6 +655,10 @@ class LeadProvider:
                 if read_end <= self.end:
                     target_tab[covr_end_bin]=target_tab[covr_end_bin]-1 if covr_end_bin in target_tab else -1
 
+        self.config.average_regional_nm=nm_sum/float(max(1,nm_count))
+        self.config.qc_nm_threshold=self.config.average_regional_nm
+        #print(f"Contig {contig} avg. regional NM={self.config.average_regional_nm}, threshold={self.config.qc_nm_threshold}")
+
 
     def dev_leadtab_filename(self,contig,start,end):
         scriptloc=os.path.dirname(os.path.realpath(sys.argv[0]))


=====================================
src/sniffles/parallel.py
=====================================
@@ -47,6 +47,16 @@ class Task:
         for svtype in sv.TYPES:
             for svcluster in cluster.resolve(svtype,self.lead_provider,config,self.tandem_repeats):
                 for svcall in sv.call_from(svcluster,config,keep_qc_fails,self):
+                    if config.dev_trace_read!=False:
+                        cluster_has_read=False
+                        for ld in svcluster.leads:
+                            if ld.read_qname==config.dev_trace_read:
+                                cluster_has_read=True
+                        if cluster_has_read:
+                            import copy
+                            svcall_copy=copy.deepcopy(svcall)
+                            svcall_copy.postprocess=None
+                            print(f"[DEV_TRACE_READ] [3/4] [Task.call_candidates] Read {config.dev_trace_read} -> Cluster {svcluster.id} -> preliminary SVCall {svcall_copy}")
                     candidates.append(svcall)
 
         self.coverage_average_fwd,self.coverage_average_rev=postprocessing.coverage(candidates,self.lead_provider,config)
@@ -66,6 +76,18 @@ class Task:
             postprocessing.annotate_sv(svcall,config)
 
             svcall.qc=svcall.qc and postprocessing.qc_sv_post_annotate(svcall,config)
+
+            if config.dev_trace_read!=False:
+                cluster_has_read=False
+                for ld in svcall.postprocess.cluster.leads:
+                    if ld.read_qname==config.dev_trace_read:
+                        cluster_has_read=True
+                if cluster_has_read:
+                    import copy
+                    svcall_copy=copy.deepcopy(svcall)
+                    svcall_copy.postprocess=None
+                    print(f"[DEV_TRACE_READ] [4/4] [Task.finalize_candidates] Read {config.dev_trace_read} -> Cluster {svcall.postprocess.cluster.id} -> finalized SVCall, QC={svcall_copy.qc}: {svcall_copy}")
+
             if not keep_qc_fails and not svcall.qc:
                 continue
 
@@ -80,7 +102,7 @@ class Task:
             snf_in.read_header()
             samples_headers_snf[snf_info["internal_id"]]=(snf_info["filename"],snf_in.header,snf_in)
 
-            if not config.snf_combine_keep_open:
+            if config.combine_close_handles:
                 snf_in.close()
 
         svcalls=[]


=====================================
src/sniffles/postprocessing.py
=====================================
@@ -120,6 +120,8 @@ def coverage_fulfill(requests_for_coverage,calls,lead_provider,config):
     return average_coverage_fwd,average_coverage_rev
 
 def qc_sv_support(svcall,coverage_global,config):
+    if config.mosaic:
+        return True
     if config.minsupport == "auto":
         if not qc_support_auto(svcall,coverage_global,config):
             svcall.filter="SUPPORT_MIN"
@@ -165,6 +167,10 @@ def qc_support_const(svcall,config):
     return svcall.support >= config.minsupport
 
 def qc_sv(svcall,config):
+    af=svcall.get_info("AF")
+    af=af if af!=None else 0
+    sv_is_mosaic = af <= config.mosaic_af_max
+
     if config.qc_stdev:
         stdev_pos=svcall.get_info("STDEV_POS")
         if stdev_pos > config.qc_stdev_abs_max:
@@ -187,30 +193,111 @@ def qc_sv(svcall,config):
         svcall.filter="SVLEN_MIN"
         return False
 
+    if svcall.svtype=="BND":
+        if config.qc_bnd_filter_strand and len(set(l.strand for l in svcall.postprocess.cluster.leads))<2:
+            svcall.filter="STRAND"
+            return False
+    elif ((config.mosaic and sv_is_mosaic) and config.mosaic_qc_strand) or (not (config.mosaic and sv_is_mosaic) and config.qc_strand):
+        is_long_ins=(svcall.svtype=="INS" and svcall.svlen >= config.long_ins_length)
+        if not is_long_ins and len(set(l.strand for l in svcall.postprocess.cluster.leads))<2:
+            svcall.filter="STRAND"
+            return False
+
+    if config.mosaic and sv_is_mosaic:
+        if svcall.svtype=="INV" or svcall.svtype=="DUP" and svcall.svlen < config.mosaic_qc_invdup_min_length:
+            svcall.filter="SVLEN_MIN"
+            return False
+
     #if (svcall.coverage_upstream != None and svcall.coverage_upstream < config.qc_coverage) or (svcall.coverage_downstream != None and svcall.coverage_downstream < config.qc_coverage):
     if svcall.svtype != "DEL" and svcall.svtype != "INS" and (svcall.coverage_center != None and svcall.coverage_center < config.qc_coverage):
         svcall.filter="COV_MIN"
         return False
 
-    if svcall.svtype == "DEL" and config.long_del_length != -1 and abs(svcall.svlen) >= config.long_del_length and not config.non_germline:
+    if svcall.svtype == "DEL" and config.long_del_length != -1 and abs(svcall.svlen) >= config.long_del_length and not config.mosaic:
         if svcall.coverage_center != None and svcall.coverage_upstream != None and svcall.coverage_downstream != None and svcall.coverage_center > (svcall.coverage_upstream+svcall.coverage_downstream)/2.0 * config.long_del_coverage:
             svcall.filter="COV_CHANGE"
             return False
     elif svcall.svtype=="INS" and ( (svcall.coverage_upstream != None and svcall.coverage_upstream < config.qc_coverage) or (svcall.coverage_downstream != None and svcall.coverage_downstream < config.qc_coverage)):
         svcall.filter="COV_CHANGE"
         return False
-    elif svcall.svtype == "DUP" and config.long_dup_length != -1 and abs(svcall.svlen) >= config.long_dup_length and not config.non_germline:
+    elif svcall.svtype == "DUP" and config.long_dup_length != -1 and abs(svcall.svlen) >= config.long_dup_length and not config.mosaic:
         if svcall.coverage_center != None and svcall.coverage_upstream != None and svcall.coverage_downstream != None and svcall.coverage_center < (svcall.coverage_upstream+svcall.coverage_downstream)/2.0 * config.long_dup_coverage:
             svcall.filter="COV_CHANGE"
             return False
 
+    qc_coverage_max_change_frac=config.qc_coverage_max_change_frac
+    if config.mosaic and sv_is_mosaic:
+        qc_coverage_max_change_frac=config.mosaic_qc_coverage_max_change_frac
+    if qc_coverage_max_change_frac != -1.0:
+        if svcall.coverage_upstream!=None and svcall.coverage_upstream!=0:
+            u=float(svcall.coverage_upstream)
+        else:
+            u=1.0
+
+        if svcall.coverage_start!=None and svcall.coverage_start!=0:
+            s=float(svcall.coverage_start)
+        else:
+            s=1.0
+
+        if svcall.coverage_center!=None and svcall.coverage_center!=0:
+            c=float(svcall.coverage_center)
+        else:
+            c=1.0
+
+        if svcall.coverage_end!=None and svcall.coverage_end!=0:
+            e=float(svcall.coverage_end)
+        else:
+            e=1.0
+
+        if svcall.coverage_downstream!=None and svcall.coverage_downstream!=0:
+            d=float(svcall.coverage_downstream)
+        else:
+            d=1.0
+
+        if abs(u-s)/max(u,s) > qc_coverage_max_change_frac:
+            svcall.filter="COV_CHANGE_FRAC"
+            return False
+
+        if abs(s-c)/max(s,c) > qc_coverage_max_change_frac:
+            svcall.filter="COV_CHANGE_FRAC"
+            return False
+
+        if abs(c-e)/max(c,e) > qc_coverage_max_change_frac:
+            svcall.filter="COV_CHANGE_FRAC"
+            return False
+
+        if abs(e-d)/max(e,d) > qc_coverage_max_change_frac:
+            svcall.filter="COV_CHANGE_FRAC"
+            return False
+
     return True
 
 def qc_sv_post_annotate(svcall,config):
+    af=svcall.get_info("AF")
+    af=af if af!=None else 0
+    sv_is_mosaic = af <= config.mosaic_af_max
+
     if (len(svcall.genotypes)==0 or (svcall.genotypes[0][0]!="." and svcall.genotypes[0][0]+svcall.genotypes[0][1]<2)) and (svcall.coverage_center != None and svcall.coverage_center < config.qc_coverage):
         svcall.filter="COV_MIN"
         return False
 
+    qc_nm=config.qc_nm
+    qc_nm_threshold=config.qc_nm_threshold*config.qc_nm_mult
+    if config.mosaic and sv_is_mosaic:
+        qc_nm=config.mosaic_qc_nm
+        qc_nm_threshold=config.qc_nm_threshold*config.qc_nm_mult
+    if qc_nm and svcall.nm > qc_nm_threshold and (len(svcall.genotypes)==0 or svcall.genotypes[0][1]==0):
+        svcall.filter="ALN_NM"
+        return False
+
+    if config.mosaic:
+        if sv_is_mosaic and ( af < config.mosaic_af_min or af > config.mosaic_af_max ):
+            svcall.filter="MOSAIC_AF"
+            return False
+        elif not sv_is_mosaic and not config.mosaic_include_germline:
+            svcall.filter="MOSAIC_AF"
+            return False
+
     return True
 
 def binomial_coef(n,k):
@@ -300,8 +387,8 @@ def genotype_sv(svcall,config,phase):
     genotype_z_score = min(60,int((-10) * likelihood_ratio(qz,q1)))
     genotype_quality = min(60,int((-10) * likelihood_ratio(q2,q1)))
 
-    is_long_ins=(svcall.svtype=="INS" and svcall.svlen >= config.long_ins_length)
-    if genotype_z_score < config.genotype_min_z_score and not config.non_germline and not is_long_ins:
+    is_long_ins=(svcall.svtype=="INS" and svcall.svlen >= config.long_ins_length and config.detect_large_ins)
+    if genotype_z_score < config.genotype_min_z_score and not config.mosaic and not is_long_ins:
         if svcall.filter=="PASS":
             svcall.filter="GT"
 


=====================================
src/sniffles/snf.py
=====================================
@@ -24,6 +24,14 @@ class SNFile:
         self.index={}
         self.total_length=0
 
+    def is_open(self):
+        return self.handle!=False
+
+    def open(self):
+        if self.handle!=False:
+            self.close()
+        self.handle=open(self.filename,"rb")
+
     def store(self,svcand):
         block_index=int(svcand.pos/self.config.snf_block_size)*self.config.snf_block_size
         if not block_index in self.blocks:
@@ -77,6 +85,8 @@ class SNFile:
         return pickle.loads(data)
 
     def write_and_index(self):
+        if not self.is_open():
+            self.open()
         offset=0
         for block_id in sorted(self.blocks):
             data=gzip.compress(self.serialize_block(block_id))
@@ -85,8 +95,12 @@ class SNFile:
             self.index[block_id]=(offset,data_len)
             offset+=data_len
             self.total_length+=data_len
+        if self.config.combine_close_handles:
+            self.close()
 
     def read_header(self):
+        if not self.is_open():
+            self.open()
         try:
             header_text=self.handle.readline()
             self.header_length=len(header_text)
@@ -95,13 +109,21 @@ class SNFile:
             print(f"Error when reading SNF header from '{self.filename}': {e}. The file may not be a valid .snf file or could have been corrupted.")
             raise e
         self.index=self.header["index"]
+        if self.config.combine_close_handles:
+            self.close()
 
     def read_blocks(self,contig,block_index):
+        if not self.is_open():
+            self.open()
         block_index=str(block_index)
         if not contig in self.index:
+            if self.config.combine_close_handles:
+                self.close()
             return None
 
         if not block_index in self.index[contig]:
+            if self.config.combine_close_handles:
+                self.close()
             return None
 
         blocks=[]
@@ -112,7 +134,11 @@ class SNFile:
                 blocks.append(self.unserialize_block(data))
             except Exception as e:
                 print(f"Error when reading block '{contig}.{block_index}' from '{self.filename}': {e}. The file may not be a valid .snf file or could have been corrupted.")
+                if self.config.combine_close_handles:
+                    self.close()
                 raise e
+        if self.config.combine_close_handles:
+            self.close()
         return blocks
 
     def get_index(self):
@@ -122,4 +148,6 @@ class SNFile:
         return self.total_length
 
     def close(self):
-        self.handle.close()
+        if self.handle!=False:
+            self.handle.close()
+            self.handle=False


=====================================
src/sniffles/sniffles
=====================================
@@ -20,7 +20,6 @@ import collections
 import math
 import time
 import os
-from pathlib import Path
 import json
 
 import pysam
@@ -82,7 +81,7 @@ def Sniffles2_Main(config,processes):
         util.fatal_error_main(f"Failed to determine run mode from input. Please specify either: A single .bam file - OR - one or more .snf files - OR - a single .tsv file containing a list of .snf files and optional sample ids as input. (supplied were: {list(set(input_ext))})")
 
     if config.mode != "call_sample" and config.snf != None:
-        util.fatal_error_main(f"--snf cannot be used with run mode {mode}")
+        util.fatal_error_main(f"--snf cannot be used with run mode {config.mode}")
 
     if config.vcf == None and config.snf == None:
         util.fatal_error_main("Please specify at least one of: --vcf or --snf for output (both may be used at the same time)")
@@ -185,8 +184,6 @@ def Sniffles2_Main(config,processes):
         if os.path.exists(config.vcf) and not config.allow_overwrite:
             util.fatal_error_main(f"Output file '{config.vcf}' already exists! Use --allow-overwrite to ignore this check and overwrite.")
         else:
-            if not Path(config.vcf).parent.exists():
-                Path(config.vcf).parent.mkdir(parents=True)
             if config.vcf_output_bgz:
                 if not config.sort:
                     util.fatal_error_main(".gz (bgzip) output is only supported with sorting enabled")
@@ -333,7 +330,7 @@ def Sniffles2_Main(config,processes):
                     sample_id,_=os.path.splitext(os.path.basename(input_filename))
             config.snf_input_info.append({"internal_id":snf_internal_id, "sample_id": sample_id, "filename": input_filename})
             snf_internal_id+=1
-            snf_in.handle.close()
+            snf_in.close()
 
         if not config.combine_consensus:
             for info in config.snf_input_info:


=====================================
src/sniffles/sv.py
=====================================
@@ -126,15 +126,9 @@ def call_from(cluster,config,keep_qc_fails,task):
     support_rev=len(leads) - support_fwd
 
     filter="PASS"
-    if config.qc_strand and (support_fwd==0 or support_rev==0):
-        filter="STRAND"
-        qc=False
 
-    if config.qc_nm:
+    if config.qc_nm_measure:
         nm_mean=util.mean(v.nm for v in leads)
-        if nm_mean > config.qc_nm_max:
-            filter="NM"
-            qc=False
     else:
         nm_mean=-1
 
@@ -299,6 +293,20 @@ def call_group(svgroup,config,task):
         if cons_a!=1 and cons_b!=1:
             return None
 
+    if config.combine_pair_relabel:
+        max_gt=(0,0)
+        for sample_id in genotypes:
+            a,b,qual,dr,dv,ps,new_id=genotypes[sample_id]
+            if qual > config.combine_pair_relabel_threshold and a!=".":
+                max_gt=max(max_gt,(a,b))
+
+        if max_gt!=(0,0):
+            for sample_id in genotypes:
+                a,b,qual,dr,dv,ps,new_id=genotypes[sample_id]
+                if qual < config.combine_pair_relabel_threshold and a!=".":
+                    max_a,max_b=max_gt
+                    genotypes[sample_id]=(max_a,max_b,qual,dr,dv,ps,new_id)
+
     svcall_pos=int(util.median(cand.pos for cand in svgroup.candidates))
     svcall_svlen=int(util.median(cand.svlen for cand in svgroup.candidates))
     svcall_alt=first_cand.alt
@@ -351,7 +359,8 @@ def call_group(svgroup,config,task):
 
 def classify_splits(read,leads,config,main_contig):
     minsvlen_screen=config.minsvlen_screen
-    maxsvlen_other=minsvlen_screen*5
+    maxsvlen_other=minsvlen_screen*config.dev_split_max_query_distance_mult
+    min_split_len_bnd=config.bnd_min_split_length
 
     leads.sort(key=lambda ld: ld.qry_start)
     last=leads[0]
@@ -469,7 +478,7 @@ def classify_splits(read,leads,config,main_contig):
             else:
                 a,b=last,curr
 
-            if a.contig == main_contig:
+            if a.contig == main_contig and abs(last.qry_end-last.qry_start) >= min_split_len_bnd and abs(curr.qry_end-curr.qry_start) >= min_split_len_bnd:
                 is_first=a.qry_start < b.qry_start
                 if is_first:
                     if a.strand=="+":


=====================================
src/sniffles/vcf.py
=====================================
@@ -48,7 +48,7 @@ class VCF:
         self.handle=handle
         self.call_count=0
         self.info_order=["SVTYPE","SVLEN","END","SUPPORT","RNAMES","COVERAGE","STRAND"]
-        if config.qc_nm:
+        if config.qc_nm_measure:
             self.info_order.append("NM")
 
         self.default_genotype=config.genotype_none
@@ -96,11 +96,14 @@ class VCF:
         self.write_header_line('FILTER=<ID=STDEV_LEN,Description="SV length standard deviation filter">')
         self.write_header_line('FILTER=<ID=COV_MIN,Description="Minimum coverage filter">')
         self.write_header_line('FILTER=<ID=COV_CHANGE,Description="Coverage change filter">')
+        self.write_header_line('FILTER=<ID=COV_CHANGE_FRAC,Description="Coverage fractional change filter">')
+        self.write_header_line('FILTER=<ID=MOSAIC_AF,Description="Mosaic maximum allele frequency filter">')
+        self.write_header_line('FILTER=<ID=ALN_NM,Description="Length adjusted mismatch filter">')
         self.write_header_line('FILTER=<ID=STRAND,Description="Strand support filter">')
         self.write_header_line('FILTER=<ID=SVLEN_MIN,Description="SV length filter">')
-        self.write_header_line('FILTER=<ID=NM,Description="Alignment noise level filter">')
         self.write_header_line('INFO=<ID=PRECISE,Number=0,Type=Flag,Description="Structural variation with precise breakpoints">')
         self.write_header_line('INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Structural variation with imprecise breakpoints">')
+        self.write_header_line('INFO=<ID=MOSAIC,Number=0,Type=Flag,Description="Structural variation classified as putative mosaic">')
         self.write_header_line('INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of structural variation">')
         self.write_header_line('INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variation">')
         self.write_header_line('INFO=<ID=CHR2,Number=1,Type=String,Description="Mate chromsome for BND SVs">')
@@ -178,6 +181,11 @@ class VCF:
             infos["END"]=None
 
         infos_ordered=["PRECISE" if call.precise else "IMPRECISE"]
+        af=call.get_info("AF")
+        af=af if af!=None else 0
+        sv_is_mosaic = af <= self.config.mosaic_af_max
+        if sv_is_mosaic and self.config.mosaic:
+            infos_ordered.append("MOSAIC")
         infos_ordered.extend(format_info(k,infos[k]) for k in self.info_order if infos[k]!=None)
         info_str=";".join(infos_ordered)
 



View it on GitLab: https://salsa.debian.org/med-team/sniffles/-/commit/916a744849d4c2e49ba085d5aa00597f5f521679

-- 
View it on GitLab: https://salsa.debian.org/med-team/sniffles/-/commit/916a744849d4c2e49ba085d5aa00597f5f521679
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230714/3fede18e/attachment-0001.htm>


More information about the debian-med-commit mailing list