[med-svn] [Git][med-team/libvcflib][upstream] New upstream version 1.0.0+dfsg

Michael R. Crusoe gitlab at salsa.debian.org
Wed Sep 4 04:53:23 BST 2019



Michael R. Crusoe pushed to branch upstream at Debian Med / libvcflib


Commits:
18f9a566 by Michael R. Crusoe at 2019-09-04T00:51:02Z
New upstream version 1.0.0+dfsg
- - - - -


18 changed files:

- Makefile
- README.md
- scripts/vcf2bed.py
- scripts/vcf2sqlite.py
- scripts/vcfclearid
- scripts/vcfclearinfo
- scripts/vcffirstheader
- scripts/vcfnulldotslashdot
- src/BedReader.h
- src/Variant.cpp
- src/Variant.h
- src/cdflib.cpp
- src/split.h
- src/vcfintersect.cpp
- src/vcfnormalizesvs.cpp
- src/vcfroc.cpp
- test/Makefile
- − test/tests/main


Changes:

=====================================
Makefile
=====================================
@@ -128,8 +128,16 @@ LEFTALIGN = smithwaterman/LeftAlign.o
 FSOM = fsom/fsom.o
 FILEVERCMP = filevercmp/filevercmp.o
 
-INCLUDES = -Itabixpp/htslib -I$(INC_DIR) -L. -Ltabixpp/htslib
-LDFLAGS = -L$(LIB_DIR) -lvcflib -lhts -lpthread -lz -lm -llzma -lbz2
+# Work out how to find htslib
+# Use the one we ship in tabixpp unless told otherwise by the environment
+HTS_LIB ?= $(VCF_LIB_LOCAL)/tabixpp/htslib/libhts.a
+HTS_INCLUDES ?= -I$(VCF_LIB_LOCAL)/tabixpp/htslib
+HTS_LDFLAGS ?= -L$(VCF_LIB_LOCAL)/tabixpp/htslib -lhts -lbz2 -lm -lz -llzma -pthread
+
+
+INCLUDES = $(HTS_INCLUDES) -I$(INC_DIR) 
+LDFLAGS = -L$(LIB_DIR) -lvcflib $(HTS_LDFLAGS) -lpthread -lz -lm -llzma -lbz2
+
 
 
 all: $(OBJECTS) $(BINS) scriptToBin
@@ -167,7 +175,7 @@ intervaltree: pre
 	cd intervaltree && $(MAKE) && cp *.h* $(VCF_LIB_LOCAL)/$(INC_DIR)/
 
 $(TABIX): pre
-	cd tabixpp && $(MAKE) && cp *.h* $(VCF_LIB_LOCAL)/$(INC_DIR)/
+	cd tabixpp && INCLUDES="$(HTS_INCLUDES)" LIBPATH="-L. $(HTS_LDFLAGS)" HTSLIB="$(HTS_LIB)" $(MAKE) && cp *.h* $(VCF_LIB_LOCAL)/$(INC_DIR)/
 
 $(SMITHWATERMAN): pre
 	cd smithwaterman && $(MAKE) && cp *.h* $(VCF_LIB_LOCAL)/$(INC_DIR)/ && cp *.o $(VCF_LIB_LOCAL)/$(OBJ_DIR)/
@@ -223,8 +231,14 @@ clean:
 	rm -rf $(LIB_DIR)
 	rm -rf $(INC_DIR)
 	rm -rf $(OBJ_DIR)
-	cd tabixpp && make clean
-	cd smithwaterman && make clean
-	cd fastahack && make clean
-
+	cd tabixpp && $(MAKE) clean
+	cd smithwaterman && $(MAKE) clean
+	cd fastahack && $(MAKE) clean
+	cd multichoose && $(MAKE) clean
+	cd fsom && $(MAKE) clean
+	cd libVCFH && $(MAKE) clean
+	cd test && $(MAKE) clean
+	cd filevercmp && $(MAKE) clean
+	cd intervaltree && $(MAKE) clean
+	
 .PHONY: clean all test pre


=====================================
README.md
=====================================
@@ -144,13 +144,13 @@ This makes installation easy, as users can add vcflib/bin to their path, or copy
 ### vcf2tsv
     
     usage: vcf2tsv [-n null_string] [-g] [vcf file]
-    Converts stdin or given VCF file to tab-delimited format, using null string to replace empty values in the table.
-    Specifying -g will output one line per sample with genotype information.
+Converts stdin or given VCF file to tab-delimited format, using null string to replace empty values in the table.
+Specifying -g will output one line per sample with genotype information.
     
 ### vcfaddinfo
     
     usage: vcfaddinfo <vcf file> <vcf file>
-    Adds info fields from the second file which are not present in the first vcf file.
+Adds info fields from the second file which are not present in the first vcf file.
     
     
 ### vcfafpath
@@ -167,11 +167,9 @@ Uses allele frequencies in the AF info column to estimate phylogeny at multialle
         -t, --tag-parsed FLAG   Tag records which are split apart of a complex allele
                                 with this flag
     
-    If multiple alleleic primitives (gaps or mismatches) are specified in
-    a single VCF record, split the record into multiple lines, but drop all
-    INFO fields.  "Pure" MNPs are split into multiple SNPs unless the -m
-    flag is provided.  Genotypes are phased where complex alleles have been
-    decomposed, provided genotypes in the input.
+If multiple alleleic primitives (gaps or mismatches) are specified in a single VCF record, split the record into multiple lines, but drop all INFO fields.
+"Pure" MNPs are split into multiple SNPs unless the -m flag is provided.
+Genotypes are phased where complex alleles have been decomposed, provided genotypes in the input.
 
     
 ### vcfaltcount
@@ -188,29 +186,25 @@ Counts the number of alternate alleles in the record.
         -k, --key   use this INFO field key for the annotations
         -d, --default  use this INFO field key for records without annotations
 
-    Intersect the records in the VCF file with targets provided in a BED file.
-    Intersections are done on the reference sequences in the VCF file.
-    If no VCF filename is specified on the command line (last argument) the VCF
-    read from stdin.
+Intersect the records in the VCF file with targets provided in a BED file.
+Intersections are done on the reference sequences in the VCF file.
+If no VCF filename is specified on the command line (last argument) the VCF read from stdin.
 
     
 ### vcfannotategenotypes
     
     usage: vcfannotategenotypes <annotation-tag> <vcf file> <vcf file>
 
-    annotates genotypes in the first file with genotypes in the second adding the
-    genotype as another flag to each sample filed in the first file.
-    annotation-tag is the name of the sample flag which is added to store the
-    annotation.  also adds a 'has\_variant' flag for sites where the second file has
-    a variant.
+Annotates genotypes in the first file with genotypes in the second adding the genotype as another flag to each sample filed in the first file.
+Annotation-tag is the name of the sample flag which is added to store the annotation.
+Also adds a 'has\_variant' flag for sites where the second file has a variant.
     
     
 ### vcfbreakmulti
     
     usage: vcfbreakmulti [options] [file]
     
-    If multiple alleles are specified in a single record, break the record into
-    multiple lines, preserving allele-specific INFO fields.
+If multiple alleles are specified in a single record, break the record into multiple lines, preserving allele-specific INFO fields.
     
     
 ### vcfcheck
@@ -220,7 +214,7 @@ Counts the number of alternate alleles in the record.
     options: -f, --fasta-reference  FASTA reference file to use to obtain
                                     primer sequences
     
-    Verifies that the VCF REF field matches the reference as described.
+Verifies that the VCF REF field matches the reference as described.
     
     
     
@@ -239,16 +233,18 @@ reflect positional change.
         -r --region REGION  A region specifier of the form chrN:x-y to bound the merge
 
 Combines VCF files positionally, combining samples when sites and alleles are identical.
-Any number of VCF files may be combined.  The INFO field and other columns are taken from
-one of the files which are combined when records in multiple files match.  Alleles must
-have identical ordering to be combined into one record.  If they do not, multiple records
-will be emitted.
+Any number of VCF files may be combined.
+The INFO field and other columns are taken from one of the files which are combined when records in multiple files match.
+Alleles must
+have identical ordering to be combined into one record.
+If they do not, multiple records will be emitted.
 
 
 ### vcfcommonsamples
     
-    usage: vcfcommonsamples <vcf file> <vcf file> outputs each record in the
-    first file, removing samples not present in the second
+    usage: vcfcommonsamples <vcf file> <vcf file>
+    
+Outputs each record in the first file, removing samples not present in the second.
     
     
 ### vcfcountalleles
@@ -270,11 +266,11 @@ in the file.
     
     usage: vcfentropy [options] <vcf file>
     
-    options: -f, --fasta-reference  FASTA reference file to use to obtain
-primer sequences -w, --window-size      Size of the window over which to
-calculate entropy
+    options:
+        -f, --fasta-reference  FASTA reference file to use to obtain primer sequences
+        -w, --window-size      Size of the window over which to calculate entropy
     
-    Anotates the output VCF file with, for each record, EntropyLeft,
+Anotates the output VCF file with, for each record, EntropyLeft,
 EntropyRight, EntropyCenter, which are the entropies of the sequence of the
 given window size to the left, right, and center  of the record.
     
@@ -299,26 +295,23 @@ given window size to the left, right, and center  of the record.
                               compressed file which has been indexed with tabix.  any number of
                               regions may be specified.
 
-    Filter the specified vcf file using the set of filters.
-    Filters are specified in the form "<ID> <operator> <value>:
-     -f "DP > 10"  # for info fields
-     -g "GT = 1|1" # for genotype fields
-     -f "CpG"  # for 'flag' fields
+Filter the specified vcf file using the set of filters.
+Filters are specified in the form "<ID> <operator> <value>:
+    -f "DP > 10"  # for info fields
+    -g "GT = 1|1" # for genotype fields
+    -f "CpG"  # for 'flag' fields
 
-    Operators can be any of: =, !, <, >, |, &
+Operators can be any of: =, !, <, >, |, &
 
-    Any number of filters may be specified.  They are combined via logical AND
-    unless --or is specified on the command line.  Obtain logical negation through
-    the use of parentheses, e.g. "! ( DP = 10 )"
+Any number of filters may be specified.  They are combined via logical AND unless --or is specified on the command line.
+Obtain logical negation through the use of parentheses, e.g. "! ( DP = 10 )"
 
-    For convenience, you can specify "QUAL" to refer to the quality of the site, even
-    though it does not appear in the INFO fields.
+For convenience, you can specify "QUAL" to refer to the quality of the site, even though it does not appear in the INFO fields.
 
     
 ### vcffixup
 
-Count the allele frequencies across alleles present in each record in the 
-VCF file.  (Similar to vcftools --freq.)
+Count the allele frequencies across alleles present in each record in the VCF file. (Similar to vcftools --freq.)
 
 Uses genotypes from the VCF file to correct AC (alternate allele count), AF
 (alternate allele frequency), NS (number of called), in the VCF records.  For
@@ -334,10 +327,9 @@ was not called as polymorphic.
     
     usage: vcfflatten [file]
     
-    Removes multi-allelic sites by picking the most common alternate.  Requires
-    allele frequency specification 'AF' and use of 'G' and 'A' to specify the
-    fields which vary according to the Allele or Genotype. VCF file may be
-    specified on the command line or piped as stdin.
+Removes multi-allelic sites by picking the most common alternate.
+Requires allele frequency specification 'AF' and use of 'G' and 'A' to specify the fields which vary according to the Allele or Genotype.
+VCF file may be specified on the command line or piped as stdin.
     
     
 ### vcfgeno2haplo
@@ -348,17 +340,16 @@ was not called as polymorphic.
         -w, --window-size N       compare records up to this many bp away (default 30)
         -r, --reference FILE      FASTA reference file, required with -i and -u
     
-    Convert genotype-based phased alleles within --window-size into haplotype alleles.
+Convert genotype-based phased alleles within --window-size into haplotype alleles.
     
     
     
 ### vcfgenotypecompare
     
     usage: vcfgenotypecompare <other-genotype-tag> <vcf file>
-    adds statistics to the INFO field of the vcf file describing the
-    amount of discrepancy between the genotypes (GT) in the vcf file and the
-    genotypes reported in the <other-genotype-tag>.  use this after
-    vcfannotategenotypes to get correspondence statistics for two vcfs.
+    
+Adds statistics to the INFO field of the vcf file describing the amount of discrepancy between the genotypes (GT) in the vcf file and the genotypes reported in the <other-genotype-tag>.
+Use this after vcfannotategenotypes to get correspondence statistics for two vcfs.
     
     
 ### vcfgenotypes
@@ -373,8 +364,8 @@ alleles provided in the call's ALT/REF fields.
     
     options:
         -n, --fix-null-genotypes   only apply to null and partly-null genotypes
-    
-    Set genotypes using the maximum genotype likelihood for each sample.
+
+Set genotypes using the maximum genotype likelihood for each sample.
     
     
     
@@ -411,30 +402,31 @@ Adds a field (id) which contains an allele-specific numerical index.
         -M, --merge-from FROM-TAG
         -T, --merge-to   TO-TAG   merge from FROM-TAG used in the -i file, setting TO-TAG
                                   in the current file.
+
+For bed-vcf intersection, alleles which fall into the targets are retained.
     
-    For bed-vcf intersection, alleles which fall into the targets are retained.
-    
-    For vcf-vcf intersection and union, unify on equivalent alleles within window-size bp
-    as determined by haplotype comparison alleles.
+For vcf-vcf intersection and union, unify on equivalent alleles within window-size bp as determined by haplotype comparison alleles.
     
     
 ### vcfkeepgeno
     
     usage: vcfkeepgeno <vcf file> [FIELD1] [FIELD2] ...
-    outputs each record in the vcf file, removing FORMAT fields not listed
-    on the command line from sample specifications in the output
+    
+Outputs each record in the vcf file, removing FORMAT fields not listed on the command line from sample specifications in the output.
     
     
 ### vcfkeepinfo
     
     usage: vcfkeepinfo <vcf file> [FIELD1] [FIELD2] ...
-    outputs each record in the vcf file, removing INFO fields not listed on the command line
+    
+ Outputs each record in the vcf file, removing INFO fields not listed on the command line.
     
     
 ### vcfkeepsamples
     
     usage: vcfkeepsamples <vcf file> [SAMPLE1] [SAMPLE2] ...
-    outputs each record in the vcf file, removing samples not listed on the command line
+
+Outputs each record in the vcf file, removing samples not listed on the command line.
     
     
 ### vcfleftalign
@@ -453,9 +445,9 @@ alignments.
         -r, --reference FILE  Use this reference as a basis for realignment.
         -w, --window N        Use a window of this many bp when left aligning (150).
 
-    Left-aligns variants in the specified input file or stdin.  Window size is determined
-    dynamically according to the entropy of the regions flanking the indel.  These must have
-    entropy > 1 bit/bp, or be shorter than ~5kb.
+Left-aligns variants in the specified input file or stdin.
+Window size is determined dynamically according to the entropy of the regions flanking the indel.
+These must have entropy > 1 bit/bp, or be shorter than ~5kb.
 
 
 ### vcflength
@@ -495,12 +487,11 @@ Use `vcfallelicprimitives` to decompose records while preserving format.
         -f, --fasta-reference  FASTA reference file to use to obtain primer sequences
         -l, --primer-length    The length of the primer sequences on each side of the variant
     
-    For each VCF record, extract the flanking sequences, and write them to stdout as FASTA
-    records suitable for alignment.  This tool is intended for use in designing validation
-    experiments.  Primers extracted which would flank all of the alleles at multi-allelic
-    sites.  The name of the FASTA "reads" indicates the VCF record which they apply to.
-    The form is >CHROM_POS_LEFT for the 3' primer and >CHROM_POS_RIGHT for the 5' primer,
-    for example:
+For each VCF record, extract the flanking sequences, and write them to stdout as FASTA records suitable for alignment.
+This tool is intended for use in designing validation experiments.
+Primers extracted which would flank all of the alleles at multi-allelic sites.
+The name of the FASTA "reads" indicates the VCF record which they apply to.
+The form is >CHROM_POS_LEFT for the 3' primer and >CHROM_POS_RIGHT for the 5' primer, for example:
     
     >20_233255_LEFT
     CCATTGTATATATAGACCATAATTTCTTTATCCAATCATCTGTTGATGGA
@@ -518,9 +509,9 @@ Use `vcfallelicprimitives` to decompose records while preserving format.
         -s, --scale-by KEY   scale sampling likelihood by this Float info field
         -p, --random-seed N  use this random seed
     
-    Randomly sample sites from an input VCF file, which may be provided as stdin.
-    Scale the sampling probability by the field specified in KEY.  This may be
-    used to provide uniform sampling across allele frequencies, for instance.
+Randomly sample sites from an input VCF file, which may be provided as stdin.
+Scale the sampling probability by the field specified in KEY.
+This may be used to provide uniform sampling across allele frequencies, for instance.
     
     
 ### vcfremap
@@ -539,8 +530,8 @@ Use `vcfallelicprimitives` to decompose records while preserving format.
         -R, --repeat-gap-extend N    penalize non-repeat-unit gaps in repeat sequence
         -a, --adjust-vcf TAG         supply a new cigar as TAG in the output VCF
     
-    For each alternate allele, attempt to realign against the reference with lowered gap open penalty.
-    If realignment is possible, adjust the cigar and reference/alternate alleles.
+For each alternate allele, attempt to realign against the reference with lowered gap open penalty.
+If realignment is possible, adjust the cigar and reference/alternate alleles.
     
     
 ### vcfremoveaberrantgenotypes
@@ -553,7 +544,8 @@ allele observation) for each genotype.
 ### vcfremovesamples
     
     usage: vcfremovesamples <vcf file> [SAMPLE1] [SAMPLE2] ...
-    outputs each record in the vcf file, removing samples listed on the command line
+
+Outputs each record in the vcf file, removing samples listed on the command line.
     
     
 ### vcfroc
@@ -565,17 +557,17 @@ allele observation) for each genotype.
         -w, --window-size N       compare records up to this many bp away (default 30)
         -r, --reference FILE      FASTA reference file
     
-    Generates a pseudo-ROC curve using sensitivity and specificity estimated against
-    a putative truth set.  Thresholding is provided by successive QUAL cutoffs.
+Generates a pseudo-ROC curve using sensitivity and specificity estimated against a putative truth set.
+Thresholding is provided by successive QUAL cutoffs.
     
     
 ### vcfsamplediff
     
     usage: vcfsamplediff <tag> <sample> <sample> [ <sample> ... ] <vcf file>
-    tags each record where the listed sample genotypes differ with <tag>
-    The first sample is assumed to be germline, the second somatic.
-    Each record is tagged with <tag>={germline,somatic,loh} to specify the type of
-    variant given the genotype difference between the two samples.
+    
+Tags each record where the listed sample genotypes differ with <tag>
+The first sample is assumed to be germline, the second somatic.
+Each record is tagged with <tag>={germline,somatic,loh} to specify the type of variant given the genotype difference between the two samples.
     
     
 ### vcfsamplenames
@@ -593,13 +585,10 @@ Prints the names of the samples in the VCF file.
     application: 
         vcfsom -a output.som -f "AF DP ABP" test.vcf >results.vcf
     
-    vcfsomtrains and/or applies a self-organizing map to the input VCF data
-    on stdin, adding two columns for the x and y coordinates of the winning
-    neuron in the network and an optional euclidean distance from a given
-    node (--center).
+vcfsom trains and/or applies a self-organizing map to the input VCF data on stdin, adding two columns for the x and y coordinates of the winning neuron in the network and an optional euclidean distance from a given node (--center).
     
-    If a map is provided via --apply,  map will be applied to input without
-    training.  Automated filtering to an estimated FP rate is 
+If a map is provided via --apply,  map will be applied to input without training.
+Automated filtering to an estimated FP rate is 
     
     options:
     
@@ -642,7 +631,7 @@ Prints the names of the samples in the VCF file.
         -o, --gap-open-penalty N     gap open penalty for SW algorithm
         -e, --gap-extend-penalty N   gap extension penalty for SW algorithm
     
-    Prints statistics about variants in the input VCF file.
+Prints statistics about variants in the input VCF file.
     
     
 ### vcfstreamsort


=====================================
scripts/vcf2bed.py
=====================================
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-
+from __future__ import print_function
 import sys
 
 for line in sys.stdin:
@@ -13,4 +13,4 @@ for line in sys.stdin:
     span = len(fields[3]) # handle multi-base alleles
     end = start + span
     name = fields[2]
-    print "\t".join(map(str, [chrom, start, end, name]))
+    print("\t".join(str(x) for x in [chrom, start, end, name]))


=====================================
scripts/vcf2sqlite.py
=====================================
@@ -1,12 +1,12 @@
 #!/usr/bin/env python
-
+from __future__ import print_function
 import sys
 import re
 import sqlite3
 
 if len(sys.argv) < 2:
-    print "usage", sys.argv[0], " [dbname]"
-    print "reads VCF on stdin, and writes output to a sqlite3 db [dbname]"
+    print("usage {} [dbname]").format(sys.argv[0])
+    print("reads VCF on stdin, and writes output to a sqlite3 db [dbname]")
     exit(1)
 
 dbname = sys.argv[1]
@@ -24,7 +24,7 @@ for line in sys.stdin:
         n = re.search("Number=(.*?),", line)
         t = re.search("Type=(.*?),", line)
         if i and n and t:
-            id = i.groups()[0]
+            iden = i.groups()[0]
             number = n.groups()[0]
             if number == "A":
                 number = -1
@@ -34,8 +34,8 @@ for line in sys.stdin:
             else:
                 number = int(number)
             typestr = t.groups()[0]
-            infotypes[id] = typestr
-            infonumbers[id] = number
+            infotypes[iden] = typestr
+            infonumbers[iden] = number
         else:
             continue
     elif line.startswith('##'):
@@ -78,13 +78,13 @@ conn.execute(tablecmd)
 
 for line in sys.stdin:
     fields = line.split('\t')
-    chrom, pos, id, ref, alts, qual, filter, info = fields[:8]
+    chrom, pos, iden, ref, alts, qual, filt, info = fields[:8]
     alts = alts.split(",")
     altindex = 0
     chrom = "\'" + chrom + "\'"
-    id = "\'" + id + "\'"
+    iden = "\'" + iden + "\'"
     ref = "\'" + ref + "\'"
-    filter = "\'" + filter + "\'"
+    filt = "\'" + filt + "\'"
     for alt in alts:
         alt = "\'" + alt + "\'"
         info_values = {}
@@ -115,7 +115,7 @@ for line in sys.stdin:
                     value = "0"
             ordered_insertion.append(value)
         cmd = "insert into alleles values (" \
-            + ", ".join([chrom, pos, id, ref, alt, qual, filter]) \
+            + ", ".join([chrom, pos, iden, ref, alt, qual, filt]) \
             + ", " \
             + ", ".join(ordered_insertion) + ")"
         conn.execute(cmd)


=====================================
scripts/vcfclearid
=====================================
@@ -1,12 +1,11 @@
 #!/usr/bin/env python
-#
-
+from __future__ import print_function
 import sys
 
 for line in sys.stdin:
     if line.startswith("#"):
-        print line.strip()
+        print(line.strip())
     else:
         fields = line.strip().split("\t")
         fields[2] = "."
-        print "\t".join(fields)
+        print("\t".join(fields))


=====================================
scripts/vcfclearinfo
=====================================
@@ -1,12 +1,11 @@
 #!/usr/bin/env python
-#
-
+from __future__ import print_function
 import sys
 
 for line in sys.stdin:
     if line.startswith("#"):
-        print line.strip()
+        print(line.strip())
     else:
         fields = line.strip().split("\t")
         fields[7] = "."
-        print "\t".join(fields)
+        print("\t".join(fields))


=====================================
scripts/vcffirstheader
=====================================
@@ -1,16 +1,16 @@
 #!/usr/bin/env python
-
+from __future__ import print_function
 import sys
 
 header=True
 for line in sys.stdin:
     if line.startswith('##'):
         if header:
-            print line.strip()
+            print(line.strip())
         continue
     elif line.startswith('#'):
         if header:
-           print line.strip()
+           print(line.strip())
            header=False
         continue
-    print line.strip()
+    print(line.strip())


=====================================
scripts/vcfnulldotslashdot
=====================================
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-
+from __future__ import print_function, division
 import sys
 import math
 
@@ -8,15 +8,15 @@ def multcoeff(n,k): return bincoeff(n+k-1,k)
 
 for line in sys.stdin:
     if line.startswith("#"):
-        print line.strip()
+        print(line.strip())
         continue
     fields = line.strip().split("\t")
     alleles = len(fields[4].split(","))+1
     # assume that we have GT:GL
     # how many genotypes?  assume diploid
-    flatgls = ",".join(map(str,[0]*multcoeff(alleles,2)))
+    flatgls = ",".join([str(x) for x in [0]*multcoeff(alleles,2)])
     for i in range(9, len(fields)):
         if fields[i] == ".":
             fields[i] = "./.:" + flatgls
-    print "\t".join(fields)
+    print("\t".join(fields))
     


=====================================
src/BedReader.h
=====================================
@@ -93,7 +93,7 @@ public:
     bool isOpen(void) { return _isOpen; }
 
     vector<BedTarget> targets;
-    map<string, IntervalTree<BedTarget*> > intervals; // intervals by reference sequence
+    map<string, IntervalTree<size_t, BedTarget*> > intervals; // intervals by reference sequence
 
     vector<BedTarget> entries(void) {
 
@@ -119,20 +119,18 @@ public:
     }
 
     vector<BedTarget*> targetsContained(BedTarget& target) {
-        vector<Interval<BedTarget*> > results;
-        intervals[target.seq].findContained(target.left, target.right, results);
+        vector<Interval<size_t, BedTarget*> > results = intervals[target.seq].findContained(target.left, target.right);
         vector<BedTarget*> contained;
-        for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
+        for (vector<Interval<size_t, BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
             contained.push_back(r->value);
         }
         return contained;
     }
 
     vector<BedTarget*> targetsOverlapping(BedTarget& target) {
-        vector<Interval<BedTarget*> > results;
-        intervals[target.seq].findOverlapping(target.left, target.right, results);
+        vector<Interval<size_t, BedTarget*> > results = intervals[target.seq].findOverlapping(target.left, target.right);
         vector<BedTarget*> overlapping;
-        for (vector<Interval<BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
+        for (vector<Interval<size_t, BedTarget*> >::iterator r = results.begin(); r != results.end(); ++r) {
             overlapping.push_back(r->value);
         }
         return overlapping;
@@ -148,12 +146,12 @@ BedReader(string& fname)
     }
 
     void addTargets(vector<BedTarget>& targets) {
-        map<string, vector<Interval<BedTarget*> > > intervalsBySeq;
+        map<string, vector<Interval<size_t, BedTarget*> > > intervalsBySeq;
         for (vector<BedTarget>::iterator t = targets.begin(); t != targets.end(); ++t) {
-            intervalsBySeq[t->seq].push_back(Interval<BedTarget*>(1 + t->left, t->right, &*t));
+            intervalsBySeq[t->seq].push_back(Interval<size_t, BedTarget*>(1 + t->left, t->right, &*t));
         }
-        for (map<string, vector<Interval<BedTarget*> > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) {
-            intervals[s->first] = IntervalTree<BedTarget*>(s->second);
+        for (map<string, vector<Interval<size_t, BedTarget*> > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) {
+            intervals[s->first] = IntervalTree<size_t, BedTarget*>((vector<Interval<size_t, BedTarget*> >&&)s->second);
         }
     }
 
@@ -161,12 +159,12 @@ BedReader(string& fname)
         file.open(fname.c_str());
         _isOpen = true;
         targets = entries();
-        map<string, vector<Interval<BedTarget*> > > intervalsBySeq;
+        map<string, vector<Interval<size_t, BedTarget*> > > intervalsBySeq;
         for (vector<BedTarget>::iterator t = targets.begin(); t != targets.end(); ++t) {
-            intervalsBySeq[t->seq].push_back(Interval<BedTarget*>(1 + t->left, t->right, &*t));
+            intervalsBySeq[t->seq].push_back(Interval<size_t, BedTarget*>(1 + t->left, t->right, &*t));
         }
-        for (map<string, vector<Interval<BedTarget*> > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) {
-            intervals[s->first] = IntervalTree<BedTarget*>(s->second);
+        for (map<string, vector<Interval<size_t, BedTarget*> > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) {
+            intervals[s->first] = IntervalTree<size_t, BedTarget*>((vector<Interval<size_t, BedTarget*> >&&)s->second);
         }
     }
 


=====================================
src/Variant.cpp
=====================================
@@ -1,9 +1,81 @@
 #include "Variant.h"
 #include <utility>
 
-
 namespace vcflib {
 
+static char rev_arr [26] = {84, 66, 71, 68, 69, 70, 67, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 65,
+                           85, 86, 87, 88, 89, 90};
+
+std::string reverse_complement(const std::string& seq) {
+    // The old implementation of this function forgot to null-terminate its
+    // returned string. This implementation uses heavier-weight C++ stuff that
+    // may be slower but should ensure that that doesn't happen again.
+    
+    if (seq.size() == 0) {
+        return seq;
+    }
+
+    string ret;
+    ret.reserve(seq.size());
+    
+    std::transform(seq.rbegin(), seq.rend(), std::back_inserter(ret), [](char in) -> char {
+        bool lower_case = (in >= 'a' && in <= 'z');
+        if (lower_case) {
+            // Convert to upper case
+            in -= 32;
+        }
+        if (in < 'A' || in > 'Z') {
+            throw std::runtime_error("Out of range character " + std::to_string((uint8_t)in) + " in inverted sequence");
+        }
+        // Compute RC in terms of letter identity, and then lower-case if necessary.
+        return rev_arr[((int) in) - 'A'] + (lower_case ? 32 : 0);
+    });
+    
+    return ret;
+}
+
+std::string toUpper(const std::string& seq) {
+    if (seq.size() == 0) {
+        return seq;
+    }
+
+    string ret;
+    ret.reserve(seq.size());
+    
+    std::transform(seq.begin(), seq.end(), std::back_inserter(ret), [](char in) -> char {
+        // If it's lower-case, bring it down in value to upper-case.
+        return (in >= 'a' && in <= 'z') ? (in - 32) : in;
+    });
+    
+    return ret;
+}
+
+
+bool allATGCN(const string& s, bool allowLowerCase){
+    if (allowLowerCase){
+       for (string::const_iterator i = s.begin(); i != s.end(); ++i){
+            char c = *i;
+            if (c != 'A' && c != 'a' &&
+                c != 'C' && c != 'c' &&
+                c != 'T' && c != 't' &&
+                c != 'G' && c != 'g' &&
+                c != 'N' && c != 'n'){
+                    return false;
+            }
+        }
+    }
+    else{
+        for (string::const_iterator i = s.begin(); i != s.end(); ++i){
+            char c = *i;
+            if (c != 'A' && c != 'C' && c != 'T' && c != 'G' && c != 'N'){
+                return false;
+            }
+        }
+        
+    }
+    return true;
+}
+
 
 
 void Variant::parse(string& line, bool parseSamples) {
@@ -14,6 +86,7 @@ void Variant::parse(string& line, bool parseSamples) {
     format.clear();
     alt.clear();
     alleles.clear();
+    canonical = false;
 
     // #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT [SAMPLE1 .. SAMPLEN]
     vector<string> fields = split(line, '\t');
@@ -112,288 +185,492 @@ void Variant::parse(string& line, bool parseSamples) {
     //return true; // we should be catching exceptions...
 }
 
-pair<Variant, Variant> Variant::convert_to_breakends(FastaReference& fasta_reference){
+bool Variant::hasSVTags() const{
+    bool found_svtype = !getSVTYPE().empty();
+    bool found_len = this->info.find("SVLEN") != this->info.end() || this->info.find("END") != this->info.end() || this->info.find("SPAN") != this->info.end();
 
-    if (is_sv()){
-        Variant f;
-        Variant s;
-
-        // just copy our old variant to keep our evidence tags and such
-        f = *this;
-        s = *this;
-        // First variant gets our position
-        // The second goes at sv end
-        // We need to modify the ALT field
-        // and the SVTYPE
-        // and the name
-        // and the pair info field, if there
-        
-        int prefix = rand();
-        stringstream f_new_alt;
-        stringstream s_new_alt;
-
-        string f_new_ref;
-        string s_new_ref;
+   return found_svtype && found_len;
+}
 
-        string f_new_id;
-        string s_new_id;
 
-        int64_t opos = get_sv_end(1);
+bool Variant::isSymbolicSV() const{
+    
+    bool found_svtype = !getSVTYPE().empty();
+    
+    bool ref_valid = allATGCN(this->ref);
+    bool alts_valid = true;
+    for (auto a : this->alt){
+        if (!allATGCN(a)){
+            alts_valid = false;
+        }
+    }
+    
+    return (!ref_valid || !alts_valid) && (found_svtype);
+}
 
+string Variant::getSVTYPE(int altpos) const{
+    
+    if (altpos > 0){
+        // TODO: Implement multi-alt SVs
+        return "";
+    }
 
-        f_new_ref = (fasta_reference.getSubSequence(this->sequenceName, this->position, 1));
-        s_new_ref = (fasta_reference.getSubSequence(this->sequenceName, opos, 1));
 
-        if (this->info["SVTYPE"][0] == "DEL"){
-            f_new_alt << f_new_ref << "[" << this->sequenceName << ":" << opos << "[";  
-            s_new_alt << s_new_ref << "]" << this->sequenceName << ":" << opos << "]";
+    if (this->info.find("SVTYPE") != this->info.end()){
+        if (altpos >= this->info.at("SVTYPE").size()) {
+            return "";
         }
-        else if (this->info["SVTYPE"][0] == "INS"){
-            f_new_alt << "";
-            s_new_alt <<  "";
-        }
-        else if (this->info["SVTYPE"][0] == "INV"){
-            f_new_alt << "";
-            s_new_alt <<  "";
-        } 
+        return this->info.at("SVTYPE")[altpos];
+    }
 
+    return "";
+};
 
 
-        return make_pair(f, s);
-    }
 
-    else if (this->info.find("SVTYPE") != this->info.end() &&
-               this->info["SVTYPE"][0] == "TRA"){
+int Variant::getMaxReferencePos(){
+    if (this->canonical && this->info.find("END") != this->info.end()) {
+        // We are cannonicalized and must have a correct END
+        
+        int end = 0;
+        for (auto s : this->info.at("END")){
+            // Get the latest one defined.
+            end = max(abs(stoi(s)), end);
+        }
+        // Convert to 0-based.
+        return end - 1;
+        
+    }
 
+    if (!this->isSymbolicSV()){
+        // We don't necessarily have an END, but we don't need one
+        return this->zeroBasedPosition() + this->ref.length() - 1;
     }
     
-    else{
-        cerr << "ERROR: non-SV types cannot be converted to BND format" << endl;
-        exit(999);
+    if (this->canonicalizable()){
+        // We aren't canonical, but we could be.
+        if (this->info.find("END") != this->info.end()){
+            // We have an END; blindly trust it
+            int end = 0;
+            for (auto s : this->info.at("END")){
+                // Get the latest one defined.
+                end = max(abs(stoi(s)), end);
+            }
+            // Convert to 0-based.
+            return end - 1;
+            
+        }
+        else if (this->info.find("SVLEN") != this->info.end()){
+            // There's no endpoint, but we know an SVLEN.
+            // A negative SVLEN means a deletion, so if we find one we can say we delete that much.
+            int deleted = 0;
+            for (auto s : this->info.at("SVLEN")){
+                int alt_len = stoi(s);
+                if (alt_len > 0){
+                    // Not a deletion, so doesn't affect any ref bases
+                    continue;
+                }
+                deleted = max(-alt_len, deleted); 
+            }
+            
+            // The anchoring base at POS gets added in (because it isn't
+            // deleted) but then subtracted out (because we have to do that to
+            // match non-SV deletions). For insertions, deleted is 0 and we
+            // return 0-based POS. Inversions must have an END.
+            return this->zeroBasedPosition() + deleted;
+        }
+        else{
+            cerr << "Warning: insufficient length information for " << *this << endl;
+            return -1;
+        }
     }
-};
-
-bool Variant::is_sv(){
-    return this->info.find("SVTYPE") != this->info.end();
+    else {
+        cerr << "Warning: can't get end of non-canonicalizeable variant " << *this << endl;
+    }
+    return -1;
 }
 
-int64_t Variant::get_sv_len(int pos){
-        int64_t sv_len;
-        if (is_sv()){
-            if (this->info.find("SVLEN") != this->info.end()){
-                int64_t pre_abs = stol(this->info["SVLEN"][pos]); 
-                sv_len = abs(pre_abs);
-                this->info["END"][pos] = to_string(this->position + abs(sv_len));
-            }
-            else if (this->info.find("END") != this->info.end()){
-                int64_t pre_abs = stol(this->info["END"][pos]) - (this->position);
-                sv_len = abs( pre_abs );
-                if (this->info["SVTYPE"][pos] == "DEL"){
-                    sv_len = -1 * sv_len;
-                }
-                this->info["SVLEN"][pos] = sv_len;
 
-                }
-                else{
-                    cerr << "NO SV LENGTH INFO" << endl;
-                    exit(1999);
-                }
 
-            return sv_len;
-        }
-        else {
-            cerr << "NOT A STRUCTURAL VARIANT" << endl;
-            exit(1872);
-        }
-}
 
-int64_t Variant::get_sv_end(int pos){
-    if (is_sv()){
-        int64_t slen = get_sv_len(pos);
-        if (this->info["SVTYPE"][0] == "DEL"){
-            return (this->position - slen);
-        }
-        else{
-            return (this->position + slen);
+// To canonicalize a variant, we need either both REF and ALT seqs filled in
+// or SVTYPE and SVLEN or END or SPAN or SEQ sufficient to define the variant.
+bool Variant::canonicalizable(){
+    bool pre_canon = allATGCN(this->ref);
+    
+    for (auto& a : this->alt){
+        if (!allATGCN(a)){
+            pre_canon = false;
         }
     }
+    
+    if (pre_canon){
+        // It came in in a fully specified way.
+        // TODO: ideally, we'd check to make sure ref/alt lengths, svtypes, and ends line up right here.
+        return true;
+    }
+    
+    string svtype = getSVTYPE();
+    
+    if (svtype.empty()){
+        // We have no SV type, so we can't interpret things.
+        return false;
+    }
+    
+    // Check the tags
+    bool has_len = this->info.count("SVLEN") && !this->info.at("SVLEN").empty();
+    bool has_seq = this->info.count("SEQ") && !this->info.at("SEQ").empty();
+    bool has_span = this->info.count("SPAN") && !this->info.at("SPAN").empty();
+    bool has_end = this->info.count("END") && !this->info.at("END").empty();
+    
+    
+    if (svtype == "INS"){
+        // Insertions need a SEQ, SVLEN, or SPAN
+        return has_seq || has_len || has_span;
+    }
+    else if (svtype == "DEL"){
+        // Deletions need an SVLEN, SPAN, or END
+        return has_len || has_span || has_end;
+    }
+    else if (svtype == "INV"){
+        // Inversions need a SPAN or END
+        return has_span || has_end;
+    }
     else{
-        cerr << "VARIANT MUST BE SV" << endl;
-        exit(99829);
+        // Other SV types are unsupported
+        // TODO: DUP
+        return false;
     }
 }
 
-bool Variant::canonicalize_sv(FastaReference& fasta_reference, vector<FastaReference*> insertions, bool place_seq, int max_interval){
+bool Variant::canonicalize(FastaReference& fasta_reference, vector<FastaReference*> insertions, bool place_seq, int min_size){
     
-            bool variant_acceptable = true;
-            bool do_external_insertions = !insertions.empty();
-            int32_t sv_len = 0;
-            bool var_is_sv = false;
-
-            FastaReference* insertion_fasta;
-
-#ifdef DEBUG
-            if (do_external_insertions){
-                insertion_fasta = insertions[0];
-                for (auto x : insertion_fasta->index->sequenceNames){
-                    cerr << x << endl;
-                }
+    // Nobody should call this without checking
+    assert(canonicalizable());
+    
+    // Nobody should call this twice
+    assert(!this->canonical);
+    
+    // Find where the inserted sequence can come from for insertions
+    bool do_external_insertions = !insertions.empty();
+    FastaReference* insertion_fasta;
+    if (do_external_insertions){
+        insertion_fasta = insertions[0];
+    }
+    
+    bool ref_valid = allATGCN(ref);
+    
+    if (!ref_valid && !place_seq){
+        // If the reference is invalid, and we aren't allowed to change the ref sequence,
+        // we can't canonicalize the variant.
+        return false;
+    }
+    
+    // Check the alts to see if they are not symbolic
+    vector<bool> alt_i_atgcn (alt.size());
+    for (int i = 0; i < alt.size(); ++i){
+        alt_i_atgcn[i] = allATGCN(alt[i]);
+    }
+    
+    // Only allow single-alt variants
+    bool single_alt = alt.size() == 1;
+    if (!single_alt){
+        // TODO: this will need to be remove before supporting multiple alleles
+        cerr << "Warning: multiple ALT alleles not yet allowed for SVs" << endl;
+        return false;
+    }
+    
+    // Fill in the SV tags
+    string svtype = getSVTYPE();
+    bool has_len = this->info.count("SVLEN") && !this->info.at("SVLEN").empty();
+    bool has_seq = this->info.count("SEQ") && !this->info.at("SEQ").empty();
+    bool has_span = this->info.count("SPAN") && !this->info.at("SPAN").empty();
+    bool has_end = this->info.count("END") && !this->info.at("END").empty();
+    
+    // Where is the end, or where should it be?
+    long info_end = 0;
+    if (has_end) {
+        // Get the END from the tag
+        info_end = stol(this->info.at("END")[0]);
+    }
+    else if(ref_valid && !place_seq) {
+        // Get the END from the reference sequence, which is ready.
+        info_end = this->position + this->ref.length() - 1;
+    }
+    else if ((svtype == "DEL" || svtype == "INV") && has_span) {
+        // For deletions and inversions, we can get the END from the SPAN
+        info_end = this->position + abs(stol(this->info.at("SPAN")[0]));
+    }
+    else if (svtype == "DEL" && has_len) {
+        // For deletions, we can get the END from the SVLEN
+        info_end = this->position + abs(stol(this->info.at("SVLEN")[0]));
+    }
+    else if (svtype == "INS"){
+        // For insertions, END is just POS if not specified
+        info_end = this->position;
+    }
+    else{
+        cerr << "Warning: could not set END info " << *this << endl;
+        return false;
+    }
+    
+    // Commit back the END
+    this->info["END"].resize(1);
+    this->info["END"][0] = to_string(info_end);
+    has_end = true;
+    
+    // What is the variant length change?
+    // We store it as absolute value
+    long info_len = 0;
+    if (has_len){
+        // Get the SVLEN from the tag
+        info_len = abs(stol(this->info.at("SVLEN")[0]));
+    }
+    else if ((svtype == "INS" || svtype == "DEL") && has_span){
+        info_len = abs(stol(this->info.at("SPAN")[0]));
+    }
+    else if (svtype == "DEL"){
+        // We always have the end by now
+        // Deletion ends give you length change
+        info_len = info_end - this->position;
+    } 
+    else if (svtype == "INV"){
+        // Inversions have 0 length change unless otherwise specified.
+        info_len = 0;
+    }
+    else if (svtype == "INS" && has_seq) {
+        // Insertions can let us pick it up from the SEQ tag
+        info_len = this->info.at("SEQ").at(0).size();
+    }
+    else{
+        cerr << "Warning: could not set SVLEN info " << *this << endl;
+        return false;
+    }
+    
+    // Commit the SVLEN back
+    if (svtype == "DEL"){
+        // Should be saved as negative
+        this->info["SVLEN"].resize(1);
+        this->info["SVLEN"][0] = to_string(-info_len);
+    }
+    else{
+        // Should be saved as positive
+        this->info["SVLEN"].resize(1);
+        this->info["SVLEN"][0] = to_string(info_len);
+    }
+    // Now the length change is known
+    has_len = true;
+    
+    // We also compute a span
+    long info_span = 0;
+    if (has_span){
+        // Use the specified span
+        info_span = abs(stol(this->info.at("SVLEN")[0]));
+    }
+    else if (svtype == "INS" || svtype == "DEL"){
+        // has_len is always true here
+        // Insertions and deletions let us determine the span from the length change, unless they are complex.
+        info_span = info_len;
+    }
+    else if (svtype == "INV"){
+        // has_end is always true here
+        // Inversion span is start to end
+        info_span = info_end - this->position;
+    }
+    else{
+        cerr << "Warning: could not set SPAN info " << *this << endl;
+        return false;
+    }
+    
+    // Commit the SPAN back
+    this->info["SPAN"].resize(1);
+    this->info["SPAN"][0] = to_string(info_span);
+    // Now the span change is known
+    has_span = true;
+    
+    if (info_end < this->position) {
+        cerr << "Warning: SV END is before POS [canonicalize] " <<
+        *this << endl << "END: " << info_end << "  " << "POS: " << this->position << endl;
+        return false;
+    }
+    
+    if (has_seq) {
+        // Force the SEQ to upper case, if already present
+        this->info["SEQ"].resize(1);
+        this->info["SEQ"][0] = toUpper(this->info["SEQ"][0]);
+    }
+     
+    // Set the other necessary SV Tags (SVTYPE, SEQ (if insertion))
+    // Also check for agreement in the position tags
+    if (svtype == "INS"){
+        if (info_end != this->position){
+            cerr << "Warning: insertion END and POS do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
+            *this << endl << "END: " << info_end << "  " << "POS: " << this->position << endl;
+            
+            if (info_end == this->position + info_len) {
+                // We can probably guess what they meant here.
+                cerr << "Warning: VCF writer incorrecty produced END = POS + SVLEN for an insertion. Fixing END to POS." << endl;
+                info_end = this->position;
+                this->info["END"][0] = to_string(info_end);
+            } else {
+                return false;
+            }
+        }
+        
+        if (info_len != info_span){
+            cerr << "Warning: insertion SVLEN and SPAN do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
+            *this << endl << "SVLEN: " << info_len << "  " << "SPAN: " << info_span << endl;
+            return false;
+        }
+       
+        if (has_seq && allATGCN(this->info.at("SEQ")[0]) && this->info.at("SEQ")[0].size() != info_len){
+            cerr << "Warning: insertion SVLEN and SEQ do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
+            *this << endl << "SVLEN: " << info_len << "  " << "SEQ length: " << this->info.at("SEQ")[0].size() << endl;
+            return false;
+        }
+       
+        // Set REF
+        string ref_base = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), 1));
+        if (place_seq){
+            this->ref.assign(ref_base);
+        }
+
+        if (has_seq &&
+                 alt[0] != this->info.at("SEQ")[0] &&
+                 allATGCN(this->info.at("SEQ")[0])){
+            // Try to remove prepended ref sequence, assuming it's left-aligned
+            string s = this->alt[0];
+            s = toUpper(s.substr(this->ref.length()));
+            if (s != this->info.at("SEQ")[0] && !place_seq){
+                cerr << "Warning: INS sequence in alt field does not match SEQ tag" << endl <<
+                this->alt[0] << " " << this->info.at("SEQ")[0] << endl;
+                return false;
             }
-#endif
+            if (place_seq){
+                this->alt[0].assign( ref_base + this->info.at("SEQ")[0] );
+            }
+            
+        }
+        else if (alt_i_atgcn[0] && !has_seq){
+            string s = this->alt[0];
+            s = toUpper(s.substr(this->ref.length()));
+            this->info["SEQ"].resize(1);
+            this->info.at("SEQ")[0].assign(s);
+            
+            if (s.size() != info_len){
+                cerr << "Warning: insertion SVLEN and added bases do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
+                *this << endl << "SVLEN: " << info_len << "  " << "added bases: " << s.size() << endl;
+                return false;
+            }
+            
+        }
+        else if (alt[0][0] == '<' && do_external_insertions){
 
+            string ins_seq;
+            string seq_id = alt[0].substr(1, alt[0].size() - 2);
 
-    std::function<bool(const string&)> allATGC = [](const string& s){
-    for (string::const_iterator c = s.begin(); c != s.end(); ++c) {
-        char b = *c;
-        if (b != 'A' && b != 'T' && b != 'G' && b != 'C') {
+            if (insertion_fasta->index->find(seq_id) != insertion_fasta->index->end()){
+                ins_seq = toUpper(insertion_fasta->getSequence(seq_id));
+                if (allATGCN(ins_seq)){
+                    this->info["SEQ"].resize(1);
+                    this->info["SEQ"][0].assign(ins_seq);
+                    if (place_seq){
+                        this->alt[0].assign(ref_base + ins_seq);
+                    }
+                }
+                else {
+                    cerr << "Warning: Loaded invalid alt sequence for: " << *this << endl;
+                    return false;    
+                }
+                
+                if (ins_seq.size() != info_len){
+                    cerr << "Warning: insertion SVLEN and FASTA do not agree (complex insertions not canonicalizeable) [canonicalize] " <<
+                    *this << endl << "SVLEN: " << info_len << "  " << "FASTA bases: " << ins_seq.size() << endl;
+                    return false;
+                }
+            } 
+            else{
+                cerr << "Warning: could not locate alt sequence for: " << *this << endl;
+                return false;
+            }
+            
+        }
+        else{
+            cerr << "Warning: could not set SEQ [canonicalize]. " << *this << endl;
             return false;
         }
     }
-    return true;
-    };
-
-
-    std::function<string(const string&)> revcomp = [](const string& s){
-
-        int len = s.length();
-        char* ret = new char[len];
-        char rev_arr [26] = {84, 66, 71, 68, 69, 70, 67, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 65,
-                        85, 86, 87, 88, 89, 90};
-
-        for (int i = len - 1; i >=0; i--){
-            ret[ len - 1 - i ] = (char) rev_arr[ (int) s[i] - 65];
+    else if (svtype == "DEL"){
+        if (this->position + info_span != info_end){
+            cerr << "Warning: deletion END and SVLEN do not agree [canonicalize] " << *this << endl <<
+            "END: " << info_end << "  " << "SVLEN: " << -info_len << endl;
+            return false;
         }
-
-        return string(ret, len);
-        
-
-    };
-
-        if (!this->is_sv()){
+    
+        if (this->position + info_span != info_end){
+            cerr << "Warning: deletion END and SPAN do not agree [canonicalize] " << *this << endl <<
+            "END: " << info_end << "  " << "SPAN: " << info_span << endl;
             return false;
         }
+    
+        // Set REF
+        if (place_seq){
+            string del_seq = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), info_len + 1));
+            string ref_base = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), 1));
+            this->ref.assign( del_seq );
+            this->alt[0].assign( ref_base );
+        }
+    }
+    else if (svtype == "INV"){
+        if (this->position + info_span != info_end){
+            cerr << "Warning: inversion END and SPAN do not agree [canonicalize] " << *this << endl <<
+            "END: " << info_end << "  " << "SPAN: " << info_span << endl;
+            return false;
+        }
+        
+        if (info_len != 0){
+            cerr << "Warning: inversion SVLEN specifies nonzero length change (complex inversions not canonicalizeable) [canonicalize] " <<
+            *this << endl << "SVLEN: " << info_len << endl;
+            
+            if (info_end == this->position + info_len) {
+                // We can probably guess what they meant here.
+                cerr << "Warning: VCF writer incorrecty produced END = POS + SVLEN for an inversion. Fixing SVLEN to 0." << endl;
+                info_len = 0;
+                this->info["SVLEN"][0] = to_string(info_len);
+            } else {
+                return false;
+            }
+        }
+    
+        if (place_seq){
+            string ref_seq = toUpper(fasta_reference.getSubSequence(this->sequenceName, this->zeroBasedPosition(), info_span + 1));
+            // Note that inversions still need an anchoring left base at POS
+            string inv_seq = ref_seq.substr(0, 1) + reverse_complement(ref_seq.substr(1));
+            this->ref.assign(ref_seq);
+            this->alt[0].assign(inv_seq);
+        }
 
+    }
+    else{
+        cerr << "Warning: invalid SV type [canonicalize]:" << *this << endl;
+        return false;
+    }
 
-                if (this->info.find("SVTYPE") != this->info.end() &&
-                        !(this->info["SVTYPE"].empty())){
-
-                    var_is_sv = true;
-
-                    for (int alt_pos = 0; alt_pos < this->alt.size(); ++alt_pos) {
-                        string a = this->alt[alt_pos];
-                            // These should be normalized-ish
-                            // Ref field might be "N"
-                            // Alt field could be <INS>, but it *should* be the inserted sequence,
-                            // but that might be tucked away in an info field
-                            //
-                            //
-                            // From the VCF4.2 Specs:
-                            // END is POS + LEN(REF) - 1
-                            // SVLEN is the difference in length between REF and ALT alleles.
-
-
-                        if (!(this->info["SVTYPE"][alt_pos] == "INV" ||
-                                        this->info["SVTYPE"][alt_pos] == "DEL" ||
-                                        this->info["SVTYPE"][alt_pos] == "INS") || this->alt.size() > 1){
-                                variant_acceptable = false;
-                                break;                                                                                                                                  
-                        }
-
-
-                        sv_len = get_sv_len(alt_pos);
-
-
-                        /**if (this->info.find("SVLEN") != this->info.end()){
-                            int32_t pre_abs = stoi(this->info["SVLEN"][alt_pos]); 
-                            sv_len = abs(pre_abs);
-                            this->info["END"][alt_pos] = this->position + sv_len;
-                        }
-                        else if (this->info.find("END") != this->info.end()){
-                            int32_t pre_abs = stoi(this->info["END"][alt_pos]) - (this->position);
-                            sv_len = abs( pre_abs );
-                            this->info["SVLEN"][alt_pos] = sv_len;
-
-                        }
-                        else{
-                            // If we have neither, we'll ignore it.
-                            variant_acceptable = false;
-                            break;
-                        }
-                        **/
-                        if (place_seq && (this->info["SVTYPE"][alt_pos] == "INS" || a == "<INS>")){
-                            this->ref.assign(fasta_reference.getSubSequence(this->sequenceName, this->position, 1));
-                            //if (this->alt[alt_pos] == "<INS>"){
-                            //    variant_acceptable = false;
-                            //}
-                            if (do_external_insertions){
-                                insertion_fasta = insertions[0];
-                                string var_name;
-                                if (alt[alt_pos][0] == '<' && alt[alt_pos][ alt[alt_pos].size() - 1 ] == '>'){
-                                    string shortname (alt[alt_pos], 1, alt[alt_pos].length() - 2 );
-                                    var_name = shortname;
-                                }
-                                else{
-                                    var_name = alt[alt_pos]; 
-                                }
-                                #ifdef DEBUG
-                                    cerr << "My variant name is: " << var_name << endl;
-                                #endif
-                                if (insertion_fasta->index->find(var_name) != insertion_fasta->index->end()){
-                                    this->ref.assign(fasta_reference.getSubSequence(this->sequenceName, this->position, 1 ));
-                                    #ifdef DEBUG
-                                        cerr << "Replacing insertion with sequence of " << var_name << endl;
-                                    #endif
-                                    this->alt[alt_pos] = fasta_reference.getSubSequence(this->sequenceName, this->position, 1) + insertion_fasta->getSequence(var_name);
-                                    this->updateAlleleIndexes();
-                                }
-                            }
-                            else if (allATGC(a)){
-                                // we don't have to do anything - our INS is already in the correct format!
-                            }
-                            else{
-                                variant_acceptable = false;
-                            }
-                        }
-                        else if (place_seq && (a == "<DEL>" || this->info["SVTYPE"][alt_pos] == "DEL")){
-
-
-                            this->ref.assign(fasta_reference.getSubSequence(this->sequenceName, this->position, (-1 * sv_len) + 1 ));
-
-                            this->alt[alt_pos].assign(fasta_reference.getSubSequence(this->sequenceName, this->position, 1));
-
-                            if (this->ref.size() != (-1 * sv_len) + 1){
-                                cerr << "Variant made is incorrect size" << endl;
-                                cerr << this->ref.size() - 1 << "\t" << sv_len << endl;
-                                cerr << this->ref[this->ref.size() - 1] << "\t" << endl;
-                                variant_acceptable = false;
-                            }
-                            this->updateAlleleIndexes();
-
-                        }
-                        else if (place_seq && (a == "<INV>" || this->info["SVTYPE"][alt_pos] == "INV")){
-                            this->ref = fasta_reference.getSubSequence(this->sequenceName, this->position, sv_len);
-                            string alt_str(fasta_reference.getSubSequence(this->sequenceName, this->position, sv_len));
-                            alt_str = revcomp(alt_str);
-                            this->alt[alt_pos] = alt_str;
-
-                            // add 3 bases padding to right side 
-                            //this->ref.insert(0, reference.getSubSequence(this->sequenceName, this->position - 3, 3));
-                            //this->alt[alt_pos].insert(0, reference.getSubSequence(this->sequenceName, this->position - 3, 3));
-                            //this->position = this->position - 3;
-                            this->updateAlleleIndexes();
-
-                            //variant_acceptable = false;
 
-                        }
-                        else{
-                            variant_acceptable = false;
-                        }
-                    }
+    this->updateAlleleIndexes();
 
-                }
+    // Check for harmony between ref / alt / tags
+    if (this->position > stol(this->info.at("END").at(0))){
+        cerr << "Warning: position > END. Possible reference genome mismatch." << endl;
+        return false;
+    }
 
+    if (svtype == "INS"){
+        assert(!this->info.at("SEQ")[0].empty());
+    }
 
-    return variant_acceptable;
+    this->canonical = true;
+    return true;
 }
 
 void Variant::setVariantCallFile(VariantCallFile& v) {
@@ -443,7 +720,7 @@ VariantFieldType typeStrToVariantFieldType(string& typeStr) {
     }
 }
 
-VariantFieldType Variant::infoType(string& key) {
+VariantFieldType Variant::infoType(const string& key) {
     map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
     if (s == vcf->infoTypes.end()) {
         if (key == "FILTER") { // hack to use FILTER as an "info" field (why the hack?)
@@ -459,7 +736,7 @@ VariantFieldType Variant::infoType(string& key) {
     }
 }
 
-    VariantFieldType Variant::formatType(string& key) {
+    VariantFieldType Variant::formatType(const string& key) {
         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
         if (s == vcf->formatTypes.end()) {
             cerr << "no format field " << key << endl;
@@ -469,7 +746,7 @@ VariantFieldType Variant::infoType(string& key) {
         }
     }
 
-    bool Variant::getInfoValueBool(string& key, int index) {
+    bool Variant::getInfoValueBool(const string& key, int index) {
         map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
         if (s == vcf->infoTypes.end()) {
             cerr << "no info field " << key << endl;
@@ -497,11 +774,12 @@ VariantFieldType Variant::infoType(string& key) {
                     return true;
             } else {
                 cerr << "not flag type " << key << endl;
+                exit(1);
             }
         }
     }
 
-    string Variant::getInfoValueString(string& key, int index) {
+    string Variant::getInfoValueString(const string& key, int index) {
         map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
         if (s == vcf->infoTypes.end()) {
             if (key == "FILTER") {
@@ -536,7 +814,7 @@ VariantFieldType Variant::infoType(string& key) {
         }
     }
 
-    double Variant::getInfoValueFloat(string& key, int index) {
+    double Variant::getInfoValueFloat(const string& key, int index) {
         map<string, VariantFieldType>::iterator s = vcf->infoTypes.find(key);
         if (s == vcf->infoTypes.end()) {
             if (key == "QUAL") {
@@ -593,7 +871,7 @@ VariantFieldType Variant::infoType(string& key) {
         return valid_genotypes;
     }
 
-    bool Variant::getSampleValueBool(string& key, string& sample, int index) {
+    bool Variant::getSampleValueBool(const string& key, string& sample, int index) {
         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
         if (s == vcf->infoTypes.end()) {
             cerr << "no info field " << key << endl;
@@ -622,11 +900,12 @@ VariantFieldType Variant::infoType(string& key) {
                     return true;
             } else {
                 cerr << "not bool type " << key << endl;
+                exit(1);
             }
         }
     }
 
-    string Variant::getSampleValueString(string& key, string& sample, int index) {
+    string Variant::getSampleValueString(const string& key, string& sample, int index) {
         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
         if (s == vcf->infoTypes.end()) {
             cerr << "no info field " << key << endl;
@@ -656,11 +935,12 @@ VariantFieldType Variant::infoType(string& key) {
                 }
             } else {
                 cerr << "not string type " << key << endl;
+                exit(1);
             }
         }
     }
 
-    double Variant::getSampleValueFloat(string& key, string& sample, int index) {
+    double Variant::getSampleValueFloat(const string& key, string& sample, int index) {
         map<string, VariantFieldType>::iterator s = vcf->formatTypes.find(key);
         if (s == vcf->infoTypes.end()) {
             cerr << "no info field " << key << endl;
@@ -694,11 +974,12 @@ VariantFieldType Variant::infoType(string& key) {
                 return r;
             } else {
                 cerr << "unsupported type for sample " << type << endl;
+                exit(1);
             }
         }
     }
 
-    bool Variant::getValueBool(string& key, string& sample, int index) {
+    bool Variant::getValueBool(const string& key, string& sample, int index) {
         if (sample.empty()) { // an empty sample name means
             return getInfoValueBool(key, index);
         } else {
@@ -706,7 +987,7 @@ VariantFieldType Variant::infoType(string& key) {
         }
     }
 
-    double Variant::getValueFloat(string& key, string& sample, int index) {
+    double Variant::getValueFloat(const string& key, string& sample, int index) {
         if (sample.empty()) { // an empty sample name means
             return getInfoValueFloat(key, index);
         } else {
@@ -714,7 +995,7 @@ VariantFieldType Variant::infoType(string& key) {
         }
     }
 
-    string Variant::getValueString(string& key, string& sample, int index) {
+    string Variant::getValueString(const string& key, string& sample, int index) {
         if (sample.empty()) { // an empty sample name means
             return getInfoValueString(key, index);
         } else {
@@ -722,7 +1003,7 @@ VariantFieldType Variant::infoType(string& key) {
         }
     }
 
-    int Variant::getAltAlleleIndex(string& allele) {
+    int Variant::getAltAlleleIndex(const string& allele) {
         map<string, int>::iterator f = altAlleleIndexes.find(allele);
         if (f == altAlleleIndexes.end()) {
             cerr << "no such allele \'" << allele << "\' in record " << sequenceName << ":" << position << endl;
@@ -732,14 +1013,14 @@ VariantFieldType Variant::infoType(string& key) {
         }
     }
 
-    void Variant::addFilter(string& tag) {
+    void Variant::addFilter(const string& tag) {
         if (filter == "" || filter == ".")
             filter = tag;
         else
             filter += "," + tag;
     }
 
-    void Variant::addFormatField(string& key) {
+    void Variant::addFormatField(const string& key) {
         bool hasTag = false;
         for (vector<string>::iterator t = format.begin(); t != format.end(); ++t) {
             if (*t == key) {
@@ -1791,6 +2072,10 @@ map<string, vector<VariantAllele> > Variant::parsedAlternates(bool includePrevio
 
     map<string, vector<VariantAllele> > variantAlleles;
 
+    if (isSymbolicSV()){
+        // Don't ever align SVs. It just wrecks things.
+        return this->flatAlternates();
+    }
     // add the reference allele
     variantAlleles[ref].push_back(VariantAllele(ref, ref, position));
 
@@ -2035,7 +2320,7 @@ void Variant::updateAlleleIndexes(void) {
 }
 
 // TODO only works on "A"llele variant fields
-  void Variant::removeAlt(string& altAllele) {
+  void Variant::removeAlt(const string& altAllele) {
 
     int altIndex = getAltAlleleIndex(altAllele);  // this is the alt-relative index, 0-based
     
@@ -2431,7 +2716,7 @@ map<int, int> glReorder(int ploidy, int numalts, map<int, int>& alleleIndexMappi
     return mapping;
 }
 
-string Variant::getGenotype(string& sample) {
+string Variant::getGenotype(const string& sample) {
     map<string, map<string, vector<string> > >::iterator s = samples.find(sample);
     if (s != samples.end()) {
         map<string, vector<string> >::iterator f = s->second.find("GT");
@@ -2456,7 +2741,7 @@ bool Variant::isPhased(void) {
     return true;
 }
 
-long Variant::zeroBasedPosition(void) {
+long Variant::zeroBasedPosition(void) const {
     return position - 1;
 }
 


=====================================
src/Variant.h
=====================================
@@ -54,6 +54,12 @@ ostream& operator<<(ostream& out, VariantFieldType type);
 typedef map<string, map<string, vector<string> > > Samples;
 typedef vector<pair<int, string> > Cigar;
 
+/// Compute a reverse complement. Supports both upper-case and lower-case input characters.
+std::string reverse_complement(const std::string& seq);
+/// Convert a sequence to upper-case
+std::string toUpper(const std::string& seq);
+bool allATGCN(const string& s, bool allowLowerCase = true);
+
 class VariantCallFile {
 
 public:
@@ -193,7 +199,7 @@ public:
 
     string sequenceName;
     long position;
-    long zeroBasedPosition(void);
+    long zeroBasedPosition(void) const;
     string id;
     string ref;
     vector<string> alt;      // a list of all the alternate alleles present at this locus
@@ -201,6 +207,7 @@ public:
                              // the indicies are organized such that the genotype codes (0,1,2,.etc.)
                              // correspond to the correct offest into the allelese vector.
                              // that is, alleles[0] = ref, alleles[1] = first alternate allele, etc.
+
     string vrepr(void);  // a comparable record of the variantion described by the record
     set<string> altSet(void);  // set of alleles, rather than vector of them
     map<string, int> altAlleleIndexes;  // reverse lookup for alleles
@@ -219,14 +226,67 @@ public:
 
     map<string, string> extendedAlternates(long int newPosition, long int length);
 
-    // Convert a structural variant the canonical VCF4.2 format using a reference.
-    // returns true if the variant is canonicalized, false otherwise.
-    bool canonicalize_sv(FastaReference& ref, vector<FastaReference*> insertions, bool place_seq = false, int interval_sz = -1);
+    /** 
+     * Convert a structural variant to the canonical VCF4.3 format using a reference.
+     *   Meturns true if the variant is canonicalized, false otherwise.
+     *   May NOT be called twice on the same variant; it will fail an assert.
+     *   Returns false for non-SVs
+     *   place_seq: if true, the ref/alt fields are
+     *       filled in with the corresponding sequences
+     *     from the reference (and optionally insertion FASTA)
+     * min_size_override: If a variant is less than this size,
+     *     and it has a valid REF and ALT, consider it canonicalized
+     *     even if the below conditions are not true.
+     * Fully canonicalized variants (which are greater than min_size_override)
+     * guarantee the following:
+     *  - POS <= END and corresponds to the anchoring base for symbolic alleles
+     *  - SVLEN info field is set and is positive for all variants except DELs
+     *  - SVTYPE info field is set and is in {DEL, INS, INV, DUP}
+     *  - END info field is set to the POS + len(REF allele) - 1 and corresponds to the final affected reference base
+     *  - Insertions get an upper-case SEQ info field
+     *  - REF and ALT are upper-case if filled in by this function
+     *  - canonical = true;
+     * TODO: CURRENTLY: canonical requires there be only one alt allele
+    **/
+    bool canonicalize(FastaReference& ref,
+         vector<FastaReference*> insertions, 
+         bool place_seq = true, 
+         int min_size_override = 0);
+         
+    /** 
+     * Returns true if the variant's ALT contains a symbolic allele like <INV>
+     * instead of sequence, and the variant has an SVTYPE INFO tag.
+     */
+    bool isSymbolicSV() const;
     
-    pair<Variant, Variant> convert_to_breakends(FastaReference& ref);
-    int64_t get_sv_end(int pos);
-    int64_t get_sv_len(int pos);
-    bool is_sv();
+    /**
+     * Returns true if the variant has an SVTYPE INFO tag and either an SVLEN or END INFO tag.
+     */
+    bool hasSVTags() const;
+    
+    /**
+     * This returns true if the variant appears able to be handled by
+     * canonicalize(). It checks if it has fully specified sequence, or if it
+     * has a defined SV type and length/endpoint. 
+     */
+    bool canonicalizable();
+    
+    /**
+     * This gets set to true after canonicalize() has been called on the variant, if it succeeded.
+     */
+    bool canonical;
+    
+    /**
+     * Get the maximum zero-based position of the reference affected by this variant.
+     * Only works reliably for variants that are not SVs or for SVs that have been canonicalize()'d.
+     */
+    int getMaxReferencePos();
+   
+    /**
+     * Return the SV type of the given alt, or "" if there is no SV type set for that alt.
+     * This is the One True Way to get the SVTYPE of a variant; we should not touch the SVTYPE tag anywhere else.
+     */
+    string getSVTYPE(int altpos = 0) const;
 
     string originalLine; // the literal of the record, as read
     // TODO
@@ -234,10 +294,10 @@ public:
     // vector<pair<int, int> > genotypes;  // indexes into the alleles, ordered as per the spec
     string filter;
     double quality;
-    VariantFieldType infoType(string& key);
+    VariantFieldType infoType(const string& key);
     map<string, vector<string> > info;  // vector<string> allows for lists by Genotypes or Alternates
     map<string, bool> infoFlags;
-    VariantFieldType formatType(string& key);
+    VariantFieldType formatType(const string& key);
     vector<string> format;
     map<string, map<string, vector<string> > > samples;  // vector<string> allows for lists by Genotypes or Alternates
     vector<string> sampleNames;
@@ -248,7 +308,7 @@ public:
     //void addInfoFloat(string& tag, double value);
     //void addInfoString(string& tag, string& value);
 
-    void removeAlt(string& altallele);
+    void removeAlt(const string& altallele);
 
 public:
 
@@ -264,29 +324,29 @@ public:
     void setVariantCallFile(VariantCallFile* v);
 
     void parse(string& line, bool parseSamples = true);
-    void addFilter(string& tag);
-    bool getValueBool(string& key, string& sample, int index = INDEX_NONE);
-    double getValueFloat(string& key, string& sample, int index = INDEX_NONE);
-    string getValueString(string& key, string& sample, int index = INDEX_NONE);
-    bool getSampleValueBool(string& key, string& sample, int index = INDEX_NONE);
-    double getSampleValueFloat(string& key, string& sample, int index = INDEX_NONE);
-    string getSampleValueString(string& key, string& sample, int index = INDEX_NONE);
-    bool getInfoValueBool(string& key, int index = INDEX_NONE);
-    double getInfoValueFloat(string& key, int index = INDEX_NONE);
-    string getInfoValueString(string& key, int index = INDEX_NONE);
+    void addFilter(const string& tag);
+    bool getValueBool(const string& key, string& sample, int index = INDEX_NONE);
+    double getValueFloat(const string& key, string& sample, int index = INDEX_NONE);
+    string getValueString(const string& key, string& sample, int index = INDEX_NONE);
+    bool getSampleValueBool(const string& key, string& sample, int index = INDEX_NONE);
+    double getSampleValueFloat(const string& key, string& sample, int index = INDEX_NONE);
+    string getSampleValueString(const string& key, string& sample, int index = INDEX_NONE);
+    bool getInfoValueBool(const string& key, int index = INDEX_NONE);
+    double getInfoValueFloat(const string& key, int index = INDEX_NONE);
+    string getInfoValueString(const string& key, int index = INDEX_NONE);
     void printAlt(ostream& out);      // print a comma-sep list of alternate alleles to an ostream
     void printAlleles(ostream& out);  // print a comma-sep list of *all* alleles to an ostream
-    int getAltAlleleIndex(string& allele);
+    int getAltAlleleIndex(const string& allele);
     void updateAlleleIndexes(void);
-    void addFormatField(string& key);
+    void addFormatField(const string& key);
     void setOutputSampleNames(vector<string>& outputSamples);
     map<pair<int, int>, int> getGenotypeIndexesDiploid(void);
     int getNumSamples(void);
     int getNumValidGenotypes(void);
-    string getGenotype(string& sample);
+    string getGenotype(const string& sample);
     bool isPhased(void);
     // TODO
-    //void setInfoField(string& key, string& val);
+    //void setInfoField(const string& key, string& val);
 
 private:
 


=====================================
src/cdflib.cpp
=====================================
@@ -10040,7 +10040,7 @@ void negative_binomial_cdf_values ( int *n_data, int *f, int *s, double *p,
     1, 2, 3,
     0, 1, 2 };
 
-  if ( n_data < 0 )
+  if ( *n_data < 0 )
   {
     *n_data = 0;
   }


=====================================
src/split.h
=====================================
@@ -47,7 +47,7 @@ void tokenize(const std::string& str, ContainerT& tokens,
 
 	lastPos = pos + 1;
     }
-};
+}
 
 
 #endif


=====================================
src/vcfintersect.cpp
=====================================
@@ -290,9 +290,9 @@ int main(int argc, char** argv) {
     // read the VCF file for union or intersection into an interval tree
     // indexed using some proximity window
 
-    map<string, IntervalTree<Variant*> > variantIntervals;
+    map<string, IntervalTree<size_t, Variant*> > variantIntervals;
     map<string, list<Variant> > otherVariants;
-    map<string, vector<Interval<Variant*> > > otherVariantIntervals;
+    map<string, vector<Interval<size_t, Variant*> > > otherVariantIntervals;
 
     if (unioning || intersecting) {
 
@@ -302,11 +302,10 @@ int main(int argc, char** argv) {
             long int right = left + ovar.ref.size(); // this should be 1-past the end
             otherVariants[ovar.sequenceName].push_back(ovar);
             Variant* v = &otherVariants[ovar.sequenceName].back();
-            otherVariantIntervals[ovar.sequenceName].push_back(Interval<Variant*>(left, right, v));
+            otherVariantIntervals[ovar.sequenceName].push_back(Interval<size_t, Variant*>(left, right, v));
         }
-	
-        for (map<string, vector<Interval<Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) {
-            variantIntervals[j->first] = IntervalTree<Variant*>(j->second);
+        for (map<string, vector<Interval<size_t, Variant*> > >::iterator j = otherVariantIntervals.begin(); j != otherVariantIntervals.end(); ++j) {
+            variantIntervals[j->first] = IntervalTree<size_t, Variant*>((vector<Interval<size_t, Variant*> >&&)j->second);
         }
 
     }
@@ -323,10 +322,9 @@ int main(int argc, char** argv) {
             lastSequenceName = var.sequenceName;
         } else if (lastSequenceName != var.sequenceName) {
             if (unioning) {
-                vector<Interval<Variant*> > previousRecords;
                 long int lastSeqLength = reference.sequenceLength(lastSequenceName);
-                variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength, previousRecords);
-                for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
+                vector<Interval<size_t, Variant*> > previousRecords = variantIntervals[lastSequenceName].findContained(lastOutputPosition, lastSeqLength);
+                for (vector<Interval<size_t, Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
                     Variant* v = r->value;
                     if (outputVariants.find(v) == outputVariants.end()) {
                         outputVariants.insert(v);
@@ -362,13 +360,12 @@ int main(int argc, char** argv) {
             // hmm... for unioning, you might need to step through the original VCF records
             // but the idea is to exclude the haplotype-based duplicates
 
-            vector<Interval<Variant*> > results;
-
-            variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results);
+            vector<Interval<size_t, Variant*> > results
+                = variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize);
 
             vector<Variant*> overlapping;
 
-            for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
+            for (vector<Interval<size_t, Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
                 overlapping.push_back(r->value);
             }
 
@@ -381,13 +378,12 @@ int main(int argc, char** argv) {
                 // between the last one printed out and the first
                 // one we're about to print out
 
-                vector<Interval<Variant*> > previousRecords;
-
-                variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize, previousRecords);
+                vector<Interval<size_t, Variant*> > previousRecords
+                    = variantIntervals[var.sequenceName].findOverlapping(lastOutputPosition, var.position - windowsize);
 
                 map<long int, vector<Variant*> > variants;
 
-                for (vector<Interval<Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
+                for (vector<Interval<size_t, Variant*> >::iterator r = previousRecords.begin(); r != previousRecords.end(); ++r) {
                     Variant* v = r->value;
                     if (outputVariants.find(v) == outputVariants.end()) {
                         outputVariants.insert(v);


=====================================
src/vcfnormalizesvs.cpp
=====================================
@@ -19,7 +19,7 @@ int main(int argc, char** argv) {
     
     string ref_file = "";
     vector<string> insertion_files;
-    int max_interval = -1;
+    int min_size = 0;
     bool replace_sequences = true;
 
     int c = 0;
@@ -92,7 +92,7 @@ int main(int argc, char** argv) {
 
     Variant var;
     while (variantFile.getNextVariant(var)) {
-        bool valid = var.canonicalize_sv(ref, insertions, replace_sequences, max_interval);
+        bool valid = var.canonicalize(ref, insertions, replace_sequences, min_size);
         if (!valid){
             cerr << "Variant could not be normalized" << var << endl;
         }


=====================================
src/vcfroc.cpp
=====================================
@@ -26,39 +26,38 @@ void printSummary(char** argv) {
 }
 
 void buildVariantIntervalTree(VariantCallFile& variantFile,
-                              map<string, IntervalTree<Variant*> >& variantIntervals,
+                              map<string, IntervalTree<size_t, Variant*> >& variantIntervals,
                               list<Variant>& variants) {
 
-    map<string, vector<Interval<Variant*> > > rawVariantIntervals;
+    map<string, vector<Interval<size_t, Variant*> > > rawVariantIntervals;
     Variant var(variantFile);
     while (variantFile.getNextVariant(var)) {
         long int left = var.position;
         long int right = left + var.ref.size(); // this should be 1-past the end
         variants.push_back(var);
         Variant* v = &variants.back();
-        rawVariantIntervals[var.sequenceName].push_back(Interval<Variant*>(left, right, v));
+        rawVariantIntervals[var.sequenceName].push_back(Interval<size_t, Variant*>(left, right, v));
     }
 	
-    for (map<string, vector<Interval<Variant*> > >::iterator j = rawVariantIntervals.begin(); j != rawVariantIntervals.end(); ++j) {
-        variantIntervals[j->first] = IntervalTree<Variant*>(j->second);
+    for (map<string, vector<Interval<size_t, Variant*> > >::iterator j = rawVariantIntervals.begin(); j != rawVariantIntervals.end(); ++j) {
+        variantIntervals[j->first] = IntervalTree<size_t, Variant*>((vector<Interval<size_t, Variant*> >&&)j->second);
     }
 }
 
 
 void intersectVariant(Variant& var,
-                      map<string, IntervalTree<Variant*> >& variantIntervals,
+                      map<string, IntervalTree<size_t, Variant*> >& variantIntervals,
                       vector<string*>& commonAlleles,
                       vector<string*>& uniqueAlleles,
                       FastaReference& reference,
                       int windowsize = 50) {
 
-    vector<Interval<Variant*> > results;
-
-    variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize, results);
+    vector<Interval<size_t, Variant*> > results
+        = variantIntervals[var.sequenceName].findContained(var.position - windowsize, var.position + var.ref.size() + windowsize);
 
     vector<Variant*> overlapping;
 
-    for (vector<Interval<Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
+    for (vector<Interval<size_t, Variant*> >::iterator r = results.begin(); r != results.end(); ++r) {
         overlapping.push_back(r->value);
     }
 
@@ -223,11 +222,11 @@ int main(int argc, char** argv) {
     // read the VCF file for union or intersection into an interval tree
     // indexed using some proximity window
 
-    map<string, IntervalTree<Variant*> > truthVariantIntervals;
+    map<string, IntervalTree<size_t, Variant*> > truthVariantIntervals;
     list<Variant> truthVariants;
     buildVariantIntervalTree(truthVariantFile, truthVariantIntervals, truthVariants);
 
-    map<string, IntervalTree<Variant*> > testVariantIntervals;
+    map<string, IntervalTree<size_t, Variant*> > testVariantIntervals;
     list<Variant> testVariants;
     buildVariantIntervalTree(variantFile, testVariantIntervals, testVariants);
 


=====================================
test/Makefile
=====================================
@@ -20,4 +20,4 @@ tests/main: ../googletest/googletest/make/gtest_main.a
 run: tests/main
 	./tests/main
 clean:
-	rm tests/main
\ No newline at end of file
+	rm -f tests/main


=====================================
test/tests/main deleted
=====================================
Binary files a/test/tests/main and /dev/null differ



View it on GitLab: https://salsa.debian.org/med-team/libvcflib/commit/18f9a56680d84676eadf8905cf2bd95b76fc9841

-- 
View it on GitLab: https://salsa.debian.org/med-team/libvcflib/commit/18f9a56680d84676eadf8905cf2bd95b76fc9841
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20190904/8e738ca9/attachment-0001.html>


More information about the debian-med-commit mailing list