[med-svn] [Git][med-team/stacks][upstream] New upstream version 2.65+dfsg

Lance Lin (@linqigang) gitlab at salsa.debian.org
Tue Aug 29 17:05:32 BST 2023



Lance Lin pushed to branch upstream at Debian Med / stacks


Commits:
eaf87a65 by Lance Lin at 2023-08-29T20:52:57+07:00
New upstream version 2.65+dfsg
- - - - -


23 changed files:

- ChangeLog
- configure.ac
- scripts/denovo_map.pl
- scripts/stacks-integrate-alignments
- src/PopSum.cc
- src/PopSum.h
- src/Seq.h
- src/Vcf.cc
- src/Vcf.h
- src/clean.h
- src/constants.h
- src/cstacks.cc
- src/cstacks.h
- src/export_formats.cc
- src/populations.cc
- src/populations.h
- src/process_radtags.cc
- src/renz.cc
- src/sql_utilities.cc
- src/sql_utilities.h
- src/sstacks.cc
- src/tsv2bam.cc
- src/ustacks.cc


Changes:

=====================================
ChangeLog
=====================================
@@ -1,3 +1,36 @@
+Stacks 2.65 - August 18, 2023
+-----------------------------
+    Feature: Added a "properly paired" reads counter to process_radtags.
+    Feature: extended populations filtering parameters to apply to fixed nucleotide sites, this
+             is applied in exports such as --vcf-all.
+    Feature: denovo_map.pl now accepts a second population map if you have a large data set and
+             would like to only load a subset of samples into the catalog.
+    Feature: Updated ustacks, cstacks, sstacks, and tsv2bam to no longer require
+             external sample ID numbers (-i option to ustacks). The pipeline will now
+             internally generate IDs when necessary, but most parts of the pipeline do
+             not need them any longer.
+    Feature: If a VCF input file for populations contains contig definitions
+             for a reference genome, those contigs will now be properly exported in any
+             VCF exports.
+    Feature: Added HaeII restriction enzyme.
+
+Stacks 2.64 - March 5, 2023
+---------------------------
+    Bugfix: the VCF export from populations could contain illegal fields for samples that have a
+            genotype call but effectively have no reads (reads contain unknown alleles or Ns in that
+	    position). This could throw errors if a user then tried to import the VCF to popoulations
+	    using --in-vcf.
+    Bugfix: private alleles. In the case of having two populations, where a particular site was differentally
+            fixed between the populations, both alleles would not be marked as private (neither would).
+            Now both alleles are marked as privte.
+
+Stacks 2.63 - October 23, 2022
+------------------------------
+    Feature: added AslI restriction enzyme.
+    Bugfix: fixed error in stacks-integrate-alns script which could cause a
+            failure when processing single-end data ('contig' is not defined in
+            catalog.fa.gz for single-end data).
+
 Stacks 2.62 - June 28, 2022
 ---------------------------
     Feature: added a '--vcf-all' export to populations which will export fixed


=====================================
configure.ac
=====================================
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.59)
-AC_INIT([Stacks], [2.62])
+AC_INIT([Stacks], [2.65])
 AC_CONFIG_AUX_DIR([config])
 AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
 AC_CONFIG_SRCDIR([src/ustacks.cc])


=====================================
scripts/denovo_map.pl
=====================================
@@ -1,6 +1,6 @@
 #!/usr/bin/env perl
 #
-# Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+# Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
 #
 # This file is part of Stacks.
 #
@@ -30,6 +30,7 @@ use constant false => 0;
 my $dry_run      = false;
 my $exe_path     = "_BINDIR_";
 my $out_path     = "";
+my $cpopmap_path = "";
 my $popmap_path  = "";
 my $sample_path  = "";
 my $db           = "";
@@ -64,6 +65,7 @@ my ($log, $log_fh, $sample);
 my (@sample_list, %pop_ids, %pops, %grp_ids, %grps, %sample_ids);
 
 parse_population_map(\@sample_list, \%pop_ids, \%pops, \%grp_ids, \%grps, \$genetic_map);
+parse_catalog_population_map(\@sample_list, \%pop_ids);
 
 initialize_samples(\@parents, \@progeny, \@samples, \@sample_list, \%pop_ids, \%grp_ids);
 
@@ -161,7 +163,7 @@ sub execute_stacks {
             next;
         }
 
-        $cmd = $exe_path . "ustacks -t $sample->{'fmt'} -f $sample->{'path'} -o $out_path -i $sample_id" . $minc;
+        $cmd = $exe_path . "ustacks -t $sample->{'fmt'} -f $sample->{'path'} -o $out_path" . $minc;
 
         if ($sample->{'path'} !~ /$sample->{'file'}.$sample->{'suffix'}$/) {
             # Guessing the sample name from the input path won't work.
@@ -290,7 +292,7 @@ sub execute_stacks {
     }
 
     #
-    # Sort the reads according by catalog locus / run tsv2bam.
+    # Sort the reads according to catalog locus / run tsv2bam.
     #
     print STDERR "Sorting reads by RAD locus...\n";
     print $log_fh "\ntsv2bam\n==========\n";
@@ -586,6 +588,44 @@ sub parse_population_map {
     close($fh);
 }
 
+sub parse_catalog_population_map {
+    my ($sample_list, $pop_ids) = @_;
+
+    my ($fh, @parts, $line, $sample);
+
+    return if (length($cpopmap_path) == 0);
+    return if ($genetic_map == true);
+    
+    my $sample_cnt = 0;
+    
+    open($fh, "<$cpopmap_path") or die("Unable to open catalog population map, '$cpopmap_path', $!\n");
+
+    while ($line = <$fh>) {
+        chomp $line;
+        next if ($line =~ /^\s*#/);
+
+        @parts = split(/\t/, $line);
+        if (scalar(@parts) != 2 and scalar(@parts) != 3) {
+            die("Unable to parse population map, '$cpopmap_path' (expected 2 or 3 columns, found " . scalar(@parts) . "); at line:\n$line\n");
+        }
+
+        foreach my $part (@parts) {
+            $part =~ s/^\s*|\s*$//g;
+        }
+
+	if (!defined($pop_ids->{$parts[0]})) {
+	    print STDERR "Unable to find '$parts[0]' in '$popmap_path'. Samples listed in the catalog population map should also be present in the main population map.\n";
+	    exit(1);
+	} else {
+	    $sample_cnt++;
+	}
+    }
+
+    print STDERR "    Parsed catalog population map: $sample_cnt of those ", scalar(@{$sample_list}), " files will be loaded into the catalog.\n";
+
+    close($fh);
+}
+
 sub initialize_samples {
     my ($parents, $progeny, $samples, $sample_list, $pop_ids, $grp_ids) = @_;
 
@@ -764,15 +804,9 @@ sub parse_command_line {
         elsif ($_ =~ /^--resume$/)  { $resume      = true; }
         elsif ($_ =~ /^--paired$/)  { $paired      = true; }
         elsif ($_ =~ /^--samples$/) { $sample_path = shift @ARGV; }
-        elsif ($_ =~ /^-O$/ || $_ =~ /^--popmap$/) {
-            $popmap_path = shift @ARGV;
-            push(@_cstacks,     "-M " . $popmap_path);
-            push(@_sstacks,     "-M " . $popmap_path);
-            push(@_tsv2bam,     "-M " . $popmap_path);
-            push(@_gstacks,     "-M " . $popmap_path);
-            push(@_populations, "-M " . $popmap_path);
-
-        } elsif ($_ =~ /^-T$/ || $_ =~ /^--threads/) {
+        elsif ($_ =~ /^-O$/ || $_ =~ /^--popmap$/) { $popmap_path = shift @ARGV; }
+	elsif ($_ =~ /^--catalog-popmap$/) { $cpopmap_path = shift @ARGV; }
+	elsif ($_ =~ /^-T$/ || $_ =~ /^--threads/) {
             $arg = shift @ARGV;
             push(@_ustacks, "-p " . $arg);
             push(@_cstacks, "-p " . $arg);
@@ -848,6 +882,18 @@ sub parse_command_line {
     if (length($popmap_path) == 0) {
         print STDERR "You must specify a population map that lists your sample names (--popmap).\n";
         usage();
+    } elsif (length($cpopmap_path) > 0) {
+	push(@_cstacks,     "-M " . $cpopmap_path);
+	push(@_sstacks,     "-M " . $popmap_path);
+	push(@_tsv2bam,     "-M " . $popmap_path);
+	push(@_gstacks,     "-M " . $popmap_path);
+	push(@_populations, "-M " . $popmap_path);
+    } else {
+	push(@_cstacks,     "-M " . $popmap_path);
+	push(@_sstacks,     "-M " . $popmap_path);
+	push(@_tsv2bam,     "-M " . $popmap_path);
+	push(@_gstacks,     "-M " . $popmap_path);
+	push(@_populations, "-M " . $popmap_path);
     }
 
     if (length($sample_path) == 0) {
@@ -882,7 +928,7 @@ sub usage {
 denovo_map.pl --samples dir --popmap path --out-path dir [--paired [--rm-pcr-duplicates]] (assembly options) (filtering options) [-X prog:"opts" ...]
 
   Input/Output files:
-    --samples: path to the directory containing the samples reads files.
+    --samples: path to the directory containing the reads files for each sample.
     --popmap: path to a population map file (format is "<name> TAB <pop>", one sample per line).
     -o,--out-path: path to an output directory.
 
@@ -908,7 +954,10 @@ denovo_map.pl --samples dir --popmap path --out-path dir [--paired [--rm-pcr-dup
   Population filtering options:
     -r,--min-samples-per-pop: minimum percentage of individuals in a population required to process a locus for that population (for populations; default: 0)
     -p,--min-populations: minimum number of populations a locus must be present in to process a locus (for populations; default: 1)
-    
+
+  For large datasets:
+    --catalog-popmap: path to a second population map file containing a subset of samples for use only in building the catalog.
+
   Miscellaneous:
     --time-components (for benchmarking)
 EOQ


=====================================
scripts/stacks-integrate-alignments
=====================================
@@ -452,6 +452,7 @@ def write_catalog(loci, chrs, datestamp, new_locus_id, ordered_loci):
             parts  = line.split(' ')
             loc_id = int(parts[0][1:])
             ns     = parts[1]
+            contig = ""
             if len(parts) > 2:
                 contig = parts[2]
             line   = ""


=====================================
src/PopSum.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2011-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -106,13 +106,13 @@ LocPopSum::tally_fixed_pos(const CSLocus *cloc, Datum const*const* d, LocSum *s,
     s->nucs[pos].reset();
 
     for (uint i = start; i <= end; i++) {
-        if (d[i] == NULL || pos >= d[i]->len)
+        if (d[i] == NULL || pos >= d[i]->len || d[i]->model[pos] == 'U')
             continue;
         //
         // Before counting this individual, make sure the model definitively called this
-        // position as hEterozygous or hOmozygous.
+        // position as hOmozygous.
         //
-        assert(d[i]->model[pos] != 'E');
+        assert(d[i]->model[pos] == 'O');
         num_indv++;
         p_nuc = cloc->con[pos];
     }
@@ -471,7 +471,7 @@ LocPopSum::log_hwp_pr(double n_fac, double p_fac, double q_fac, double pp, doubl
 int
 LocPopSum::tally_metapop(const CSLocus *cloc)
 {
-    int       variable_pop;
+    int16_t   variable_pop;
     uint16_t  p_cnt, q_cnt, col;
     LocSum  **s  = this->_per_pop;
     LocTally *mp = this->_meta_pop;
@@ -553,7 +553,16 @@ LocPopSum::tally_metapop(const CSLocus *cloc)
                 if (s[j]->nucs[col].p_nuc() == mp->nucs[col].q_allele ||
                     s[j]->nucs[col].q_nuc() == mp->nucs[col].q_allele)
                     variable_pop = j;
-        }
+        } else if (p_cnt == 1 && q_cnt == 1) {
+	    //
+	    // This is a special case where there are only two populations
+	    // with fixed, different alleles. Here both alleles are private
+	    // but we can't store both populations, so we will store the max
+	    // integer value instead.
+	    //
+	    if (mp->nucs[col].pop_cnt == 2)
+		variable_pop = two_priv_alleles;
+	}
         mp->nucs[col].priv_allele = variable_pop;
     }
 
@@ -566,13 +575,17 @@ LocPopSum::tally_ref_alleles(int snp_index, uint16_t &allele_cnt,
                              uint16_t &p_cnt, uint16_t &q_cnt)
 {
     int  nucs[4] = {0};
-    char nuc[2];
+    char nuc[2]  = {0};
 
     p_allele   = 0;
     q_allele   = 0;
     allele_cnt = 0;
 
     for (uint j = 0; j < this->_pop_cnt; j++) {
+
+	if (this->_per_pop[j]->nucs[snp_index].num_indv() == 0)
+	    continue;
+	
         nuc[0] = 0;
         nuc[1] = 0;
         nuc[0] = this->_per_pop[j]->nucs[snp_index].p_nuc();
@@ -614,6 +627,12 @@ LocPopSum::tally_ref_alleles(int snp_index, uint16_t &allele_cnt,
         return 0;
     }
 
+    if (allele_cnt == 0) {
+	p_allele = 0;
+        q_allele = 0;
+        return 1;
+    }
+    
     //
     // Record which nucleotide is the P allele and which is the Q allele.
     //
@@ -661,6 +680,10 @@ LocPopSum::tally_ref_alleles(int snp_index, uint16_t &allele_cnt,
     q_cnt = 0;
 
     for (uint j = 0; j < this->_pop_cnt; j++) {
+
+	if (this->_per_pop[j]->nucs[snp_index].num_indv() == 0)
+	    continue;
+	
         nuc[0] = 0;
         nuc[1] = 0;
         nuc[0] = this->_per_pop[j]->nucs[snp_index].p_nuc();
@@ -765,9 +788,11 @@ LocPopSum::haplotype_diversity(int start, int end, Datum const*const* d)
     //
     // If this haplotype is fixed, don't calculate any statistics.
     //
-    if (n == 0)
+    if (n == 0) {
+	delete lstat;
         return NULL;
-
+    }
+    
     //
     // Store a summary of the haplotype counts to output below.
     //


=====================================
src/PopSum.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2011-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //


=====================================
src/Seq.h
=====================================
@@ -144,7 +144,7 @@ inline
 void Seq::reserve(size_t n, bool with_qual) {
     if (capacity < long(n)) {
         capacity = n;
-        delete seq;
+        delete [] seq;
         seq = new char[capacity + 1];
         *seq = '\0';
         if (with_qual) {


=====================================
src/Vcf.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -372,6 +372,7 @@ VcfParser::read_header()
             // Skip header lines missing the '='.
             continue;
         }
+
         header_.add_meta(VcfMeta(string(line+2, equal), string(equal+1)));
     }
     if (!not_eof)


=====================================
src/Vcf.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //


=====================================
src/clean.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2011, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at uoregon.edu>
 //
 // This file is part of Stacks.
 //
@@ -396,6 +396,7 @@ process_barcode(RawRead *href_1, RawRead *href_2, BarcodePair &bc,
         barcode_log[bc]["total"]    = 0;
         barcode_log[bc]["low_qual"] = 0;
         barcode_log[bc]["retained"] = 0;
+	barcode_log[bc]["proppair"] = 0;
     }
     barcode_log[bc]["total"] += paired ? 2 : 1;
 


=====================================
src/constants.h
=====================================
@@ -159,6 +159,12 @@ const unsigned int max_samples = max_alleles / 2;
 //
 const double FSNC = -7.0;
 
+//
+// A flag to store the special case when we are searching for private alleles
+// and find two, alternative private alleles at the same locus.
+//
+const int16_t two_priv_alleles = -2;
+
 //
 // Supported file types
 //


=====================================
src/cstacks.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -42,6 +42,7 @@ double  min_match_len     = 0.80;
 double  max_gaps          = 2.0;
 
 size_t  next_catalog_id   = 1;
+size_t  next_sample_id    = 1;
 
 using namespace std;
 
@@ -71,14 +72,13 @@ int main (int argc, char* argv[]) {
 
     map<int, CLocus *> catalog;
     pair<int, string>  s;
-    bool compressed = false;
-    int  i;
+    int i;
 
-    set<int> seen_sample_ids; // For checking sample ID unicity.
+    map<string, size_t> seen_samples; // For checking sample ID unicity.
 
     if (catalog_path.length() > 0) {
         cerr << "\nInitializing existing catalog...\n";
-        if (!initialize_existing_catalog(catalog_path, catalog)) {
+        if (!initialize_existing_catalog(catalog_path, catalog, seen_samples, next_sample_id)) {
             cerr << "Error: Failed to initialize the catalog.\n";
             throw exception();
         }
@@ -89,12 +89,11 @@ int main (int argc, char* argv[]) {
         samples.pop();
 
         cerr << "\nInitializing new catalog...\n";
-        if (!initialize_new_catalog(s, catalog)) {
+        if (!initialize_new_catalog(s, catalog, seen_samples, next_sample_id)) {
             cerr << "Error: Failed to initialize the catalog.\n";
             throw exception();
         }
         i = 2;
-        seen_sample_ids.insert(catalog.begin()->second->sample_id);
     }
 
     while (!samples.empty()) {
@@ -105,6 +104,7 @@ int main (int argc, char* argv[]) {
 
         cerr << "\nProcessing sample " << s.second << " [" << i << " of " << sample_cnt << "]\n";
 
+	bool compressed = false;
         if (!load_loci(s.second, sample, 0, false, compressed)) {
             cerr << "Failed to load sample " << i << "\n";
             continue;
@@ -112,15 +112,18 @@ int main (int argc, char* argv[]) {
         //
         // Assign the ID for this sample data.
         //
-        s.first = sample.begin()->second->sample_id;
+        s.first = next_sample_id;
 
-        // Check for unique sample IDs.
-        if (!seen_sample_ids.insert(s.first).second) {
+        // Check for unique sample names.
+        if (seen_samples.count(s.second) > 0) {
             // Insert failed.
-            cerr << "Error: Sample ID '" << s.first << "' occurs more than once. Sample IDs must be unique." << endl;
+            cerr << "Error: Sample '" << s.second << "' is already inserted into this catalog. Samples must be unique." << endl;
             throw exception();
-        }
-
+	} else {
+	    seen_samples[s.second] = next_sample_id;
+	}
+	next_sample_id++;
+	
         //dump_loci(sample);
 
         cerr << "Searching for sequence matches...\n";
@@ -163,7 +166,7 @@ int main (int argc, char* argv[]) {
     cerr << "\nWriting catalog in directory '" << out_path << "'.\n"
          << "Final catalog contains " << catalog.size() << " loci.\n";
 
-    write_catalog(catalog);
+    write_catalog(catalog, seen_samples);
     cerr << "cstacks is done.\n";
 
     //
@@ -177,27 +180,6 @@ int main (int argc, char* argv[]) {
     IF_NDEBUG_CATCH_ALL_EXCEPTIONS
 }
 
-int update_catalog_index(map<int, CLocus *> &catalog, map<string, int> &cat_index) {
-    map<int, CLocus *>::iterator j;
-    char id[id_len];
-
-    for (j = catalog.begin(); j != catalog.end(); j++) {
-        snprintf(id, id_len - 1, "%s|%d|%c",
-                 j->second->loc.chr(),
-                 j->second->loc.bp,
-                 j->second->loc.strand == strand_plus ? '+' : '-');
-
-        if (cat_index.count(id) == 0) {
-            cat_index[id] = j->first;
-        } else {
-            if (cat_index[id] != j->first)
-                cerr << "Error: Catalog index mismatch, key: '" << id << "'.\n";
-        }
-    }
-
-    return 0;
-}
-
 int
 characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
 {
@@ -1042,8 +1024,9 @@ int find_matches_by_genomic_loc(map<string, int> &cat_index, map<int, QLocus *>
     return 0;
 }
 
-int write_catalog(map<int, CLocus *> &catalog) {
-
+int
+write_catalog(map<int, CLocus *> &catalog, map<string, size_t> &seen_samples)
+{
     map<int, CLocus *>::iterator i;
     CLocus  *tag;
     set<int> matches;
@@ -1053,18 +1036,24 @@ int write_catalog(map<int, CLocus *> &catalog) {
     string tag_file = out_path + "catalog.tags.tsv";
     string snp_file = out_path + "catalog.snps.tsv";
     string all_file = out_path + "catalog.alleles.tsv";
-
+    string sam_file = out_path + "catalog.sample_list.tsv";
+    
     if (gzip) {
         tag_file += ".gz";
         snp_file += ".gz";
         all_file += ".gz";
+	sam_file += ".gz";
     }
 
+    //
+    // Write the sample list.
+    //
+    
     //
     // Open the output files for writing.
     //
-    gzFile   gz_tags, gz_snps, gz_alle;
-    ofstream tags, snps, alle;
+    gzFile   gz_tags, gz_snps, gz_alle, gz_sams;
+    ofstream tags, snps, alle, sams;
     if (gzip) {
         gz_tags = gzopen(tag_file.c_str(), "wb");
         if (!gz_tags) {
@@ -1090,13 +1079,24 @@ int write_catalog(map<int, CLocus *> &catalog) {
         #if ZLIB_VERNUM >= 0x1240
         gzbuffer(gz_alle, libz_buffer_size);
         #endif
+
+	gz_sams = gzopen(sam_file.c_str(), "wb");
+        if (!gz_sams) {
+            cerr << "Error: Unable to open gzipped catalog sample list file '" << sam_file << "': " << strerror(errno) << ".\n";
+            exit(1);
+        }
+        #if ZLIB_VERNUM >= 0x1240
+        gzbuffer(gz_sams, libz_buffer_size);
+        #endif
     } else {
         tags.open(tag_file.c_str());
         snps.open(snp_file.c_str());
         alle.open(all_file.c_str());
+	sams.open(sam_file.c_str());
         check_open(tags, tag_file);
         check_open(snps, snp_file);
         check_open(alle, all_file);
+	check_open(sams, sam_file);
     }
 
     //
@@ -1116,10 +1116,12 @@ int write_catalog(map<int, CLocus *> &catalog) {
         gzputs_throwing(gz_tags, log.str().c_str());
         gzputs_throwing(gz_snps, log.str().c_str());
         gzputs_throwing(gz_alle, log.str().c_str());
+	gzputs_throwing(gz_sams, log.str().c_str());
     } else {
         tags << log.str();
         snps << log.str();
         alle << log.str();
+	sams << log.str();
     }
 
     for (i = catalog.begin(); i != catalog.end(); i++) {
@@ -1131,6 +1133,31 @@ int write_catalog(map<int, CLocus *> &catalog) {
             write_simple_output(tag, tags, snps, alle);
     }
 
+    //
+    // Write the list of samples incorporated into the catalog, sorted by sample ID.
+    //
+    vector<pair<string, size_t>> sorted_seen_samples;
+    for (auto it = seen_samples.begin(); it != seen_samples.end(); it++) {
+	string sample_path = it->first;
+	// Remove the file path off the sample string.
+	int pos = sample_path.find_last_of("/");
+	sample_path = sample_path.substr(pos+1);
+	sorted_seen_samples.push_back(make_pair(sample_path, it->second));
+    }
+    sort(sorted_seen_samples.begin(), sorted_seen_samples.end(),
+	 [] (pair<string, size_t> i, pair<string, size_t> j) { return (i.second < j.second); });
+
+    for (auto it = sorted_seen_samples.begin(); it != sorted_seen_samples.end(); it++) {
+	log.str("");
+	log << it->first << "\t" << it->second << "\n";
+
+	if (gzip) {
+	    gzputs_throwing(gz_sams, log.str().c_str());
+	} else {
+	    sams << log.str();
+	}
+    }	
+    
     time(&rawtime);
     timeinfo = localtime(&rawtime);
     strftime(date, 32, "%F %T", timeinfo);
@@ -1150,10 +1177,12 @@ int write_catalog(map<int, CLocus *> &catalog) {
         gzclose_throwing(gz_tags);
         gzclose_throwing(gz_snps);
         gzclose_throwing(gz_alle);
+	gzclose_throwing(gz_sams);
     } else {
         tags.close();
         snps.close();
         alle.close();
+	sams.close();
     }
 
     return 0;
@@ -1613,8 +1642,7 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
     }
     sources = sources.substr(0, sources.length() - 1);
 
-    cat_file << 0           << "\t" // Catalog has no sample ID.
-             << tag->id     << "\t"
+    cat_file << tag->id     << "\t"
              << "consensus" << "\t"
              << "0"         << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
              << sources     << "\t"
@@ -1627,8 +1655,7 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
     // Output the SNPs associated with the catalog tag
     //
     for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
-        snp_file << "0"            << "\t"
-                 << tag->id        << "\t"
+	snp_file << tag->id        << "\t"
                  << (*snp_it)->col << "\t";
 
         switch((*snp_it)->type) {
@@ -1655,8 +1682,7 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
     // Output the alleles associated with the two matched tags
     //
     for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
-        all_file << "0"           << "\t"
-                 << tag->id       << "\t"
+	all_file << tag->id       << "\t"
                  << all_it->first << "\t"
                  << "0"           << "\t"    // These two fields are used in the
                  << "0"           << "\n";   // ustacks/pstacks output, not in cstacks.
@@ -1678,8 +1704,7 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
 
     sstr.str("");
 
-    sstr << 0           << "\t" // Catalog has no sample ID.
-         << tag->id     << "\t"
+    sstr << tag->id     << "\t"
          << "consensus" << "\t"
          << "0"         << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
          << sources     << "\t"
@@ -1695,8 +1720,7 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
     // Output the SNPs associated with the catalog tag
     //
     for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
-        sstr << "0"            << "\t"
-             <<   tag->id      << "\t"
+	sstr <<   tag->id      << "\t"
              << (*snp_it)->col << "\t";
 
         switch((*snp_it)->type) {
@@ -1726,8 +1750,7 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
     // Output the alleles associated with the two matched tags
     //
     for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
-        sstr << "0"     << "\t"
-             << tag->id << "\t"
+	sstr << tag->id << "\t"
              << all_it->first << "\t"
              << 0       << "\t"
              << 0       << "\n";
@@ -1738,7 +1761,8 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
 }
 
 int
-initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
+initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog,
+		       map<string, size_t> &seen_samples, size_t &next_sample_id)
 {
     map<int, CLocus *> tmp_catalog;
     bool compressed = false;
@@ -1751,7 +1775,7 @@ initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
 
     in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
 
-    sample.first = tmp_catalog.begin()->second->sample_id;
+    sample.first = next_sample_id;
 
     //
     // Iterate over the catalog entires and renumber them after recording the source of
@@ -1768,11 +1792,15 @@ initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
 
     cerr << "  " << catalog.size() << " loci were newly added to the catalog.\n";
 
+    seen_samples[sample.second] = next_sample_id;
+    next_sample_id++;
+
     return 1;
 }
 
 int
-initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
+initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog,
+			    map<string, size_t> &seen_samples, size_t &next_sample_id)
 {
     bool compressed;
 
@@ -1823,6 +1851,13 @@ initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
 
     next_catalog_id = max_catalog_id + 1;
 
+    //
+    // Parse the list of samples in the existing catalog.
+    //
+    if (!load_catalog_samples(catalog_path, seen_samples, next_sample_id))
+	throw exception();
+    next_sample_id++;
+
     return 1;
 }
 


=====================================
src/cstacks.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2018, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -58,8 +58,8 @@ using std::queue;
 void help( void );
 void version( void );
 int  parse_command_line(int, char**);
-int  initialize_new_catalog(pair<int, string> &, map<int, CLocus *> &);
-int  initialize_existing_catalog(string, map<int, CLocus *> &);
+int  initialize_new_catalog(pair<int, string> &, map<int, CLocus *> &, map<string, size_t> &, size_t &);
+int  initialize_existing_catalog(string, map<int, CLocus *> &, map<string, size_t> &, size_t &);
 int  update_catalog_index(map<int, CLocus *> &, map<string, int> &);
 int  merge_catalog_loci(map<int, CLocus *> &, vector<int> &);
 int  find_kmer_matches_by_sequence(map<int, CLocus *> &, map<int, QLocus *> &, int);
@@ -72,7 +72,7 @@ int  merge_matches(map<int, CLocus *> &, map<int, QLocus *> &, pair<int, string>
 int  add_unique_tag(pair<int, string> &, map<int, CLocus *> &, QLocus *);
 bool compare_dist(pair<int, int>, pair<int, int>);
 
-int  write_catalog(map<int, CLocus *> &);
+int  write_catalog(map<int, CLocus *> &, map<string, size_t> &);
 int  write_simple_output(CLocus *, ofstream &, ofstream &, ofstream &);
 int  write_gzip_output(CLocus *, gzFile &, gzFile &, gzFile &);
 


=====================================
src/export_formats.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2012-2022, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2012-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -283,7 +283,9 @@ SumstatsExport::write_batch(const vector<LocBin *> &loci)
                               << s->nucs[pos].smoothed(1)  << "\t"  // Smoothed Fis
                               << s->nucs[pos].bs(1)        << "\t"  // Fis bootstrapped p-value.
                               << s->nucs[pos].stat(2)      << "\t"; // HWE p-value.
-                    (t->nucs[pos].priv_allele == (int) pop) ? this->_fh << "1\n" : this->_fh << "0\n";
+		    //t->nucs[pos].priv_allele == (int) pop ? this->_fh << "1\n" : this->_fh << "0\n";
+		    (t->nucs[pos].priv_allele == (int) pop ||
+		     t->nucs[pos].priv_allele == two_priv_alleles) ? this->_fh << "1\n" : this->_fh << "0\n";
                 }
             }
         }
@@ -2646,11 +2648,17 @@ VcfExport::write_site(const CSLocus* cloc,
             // DP.
             sample << ":" << d[s]->snpdata[index].tot_depth;
             // AD.
-            if (d[s]->snpdata[index].nt_depths.sum() > 0)
+            if (d[s]->snpdata[index].nt_depths.sum() > 0) {
                 sample << ":" << d[s]->snpdata[index].nt_depths[Nt2(ref)]
                        << "," << d[s]->snpdata[index].nt_depths[Nt2(alt)];
-            else
-                sample << ":.";
+	    } else {
+		//
+		// If we arrive here, there are no nucleotides present for this individual at this
+		// site, which means either there are Ns here or a nucleotide that was not part of
+		// the genotype call and was discarded.
+		//
+                sample << ":0,0";
+	    }
             // GQ.
             assert(d[s]->snpdata[index].gq != UINT8_MAX);
             sample << ':' << int(d[s]->snpdata[index].gq);


=====================================
src/populations.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2012-2022, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2012-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -161,6 +161,11 @@ try {
              << in_path << "' as a single popultaion.\n";
     }
 
+    if (bs_wl_file.length() > 0) {
+        load_marker_list(bs_wl_file, bootstraplist);
+        cerr << "Loaded " << bootstraplist.size() << " markers to include when bootstrapping.\n";
+    }
+
     //
     // Locate and open input files, read VCF headers, parse population map, load black/white lists.
     //
@@ -806,6 +811,34 @@ BatchLocusProcessor::init_external_loci(string in_path, string pmap_path)
     cout << "Opening the VCF file...\n";
     this->_vcf_parser.open(in_path);
 
+    //
+    // If this VCF input file contained contig definitions, move them to the popmap for later export.
+    //
+    const vector<VcfMeta>& vm = this->_vcf_parser.header().meta();
+    for (auto &vmit: vm) {
+	if (vmit.key() == "contig") {
+	    //
+	    // Parse a string that looks like: <ID=NW_024121099.1,length=59790976>
+	    //
+	    const char *p, *q;
+	    for (p = vmit.value().c_str(); *p != '\0' && *p != '='; p++);
+	    for (q = p + 1;                *q != '\0' && *q != ','; q++);
+	    if (*p == '\0' || *q == '\0') {
+		cerr << "Error parsing VCF header line: ##" << vmit.key() << "=" << vmit.value() << "\n";
+		throw exception();
+	    }
+	    string key = string(p+1, q);
+	    for (p = q + 1; *p != '\0' && *p != '='; p++);
+	    for (q = p + 1; *q != '\0' && *q != '>'; q++);
+	    if (*p == '\0' || *q == '\0') {
+		cerr << "Error parsing VCF header line: ##" << vmit.key() << "=" << vmit.value() << "\n";
+		throw exception();
+	    }
+	    string val = string(p+1, q);
+	    this->_mpopi->add_contig(key, val);
+	}
+    }
+    
     if (this->_vcf_parser.header().samples().empty()) {
         cerr << "Error: No samples in VCF file '" << in_path << "'.\n";
         throw exception();
@@ -1084,13 +1117,13 @@ LocusFilter::erase_snp(CSLocus *cloc, Datum **d, size_t n_samples, size_t snp_in
         assert(d[s]->snpdata.size() == cloc->snps.size());
         
         //
-        // Correct the model calls.
+        // Update the model calls.
         //
         char& m = d[s]->model[col];
         assert(m == 'O' || m == 'E' || m == 'U');
         if (m == 'E' || (m == 'O' && d[s]->obshap[0][snp_index] != cloc->con[col]))
             m = 'U';
-        //
+	//
         // Erase the nucleotide in the haplotypes.
         //
         for (char* hap : d[s]->obshap) {
@@ -1241,6 +1274,7 @@ LocusFilter::apply_filters_stacks(LocBin& loc, ostream& log_fh, const MetaPopInf
     //
     loc.s = new LocPopSum(strlen(loc.cloc->con), mpopi);
     this->filter_snps(loc, mpopi, log_fh);
+    this->filter_fixed_sites(loc, mpopi, log_fh);
 
     //
     // If write_single_snp, write_random_snp or filter_haplotype_wise has been specified,
@@ -1483,7 +1517,7 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
         // If the site is fixed, ignore it.
         //
         NucTally& nuct = t->nucs[col];
-        if (t->nucs[col].fixed == true)
+        if (nuct.fixed == true)
             continue;
 
         size_t n_pruned_pops = 0;
@@ -1512,7 +1546,7 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
         else if ((double) n_samples_left / mpopi.n_samples() < min_samples_overall)
             overall_sample_prune = true;
 
-        if (t->nucs[col].allele_cnt > 1) {
+        if (nuct.allele_cnt > 1) {
             //
             // Test for minor allele frequency.
             //
@@ -1522,7 +1556,7 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
             //
             // Test for observed heterozygosity.
             //
-            if (t->nucs[col].obs_het > max_obs_het)
+            if (nuct.obs_het > max_obs_het)
                 het_prune = true;
         }
 
@@ -1550,6 +1584,77 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
     }
 }
 
+void
+LocusFilter::filter_fixed_sites(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
+{
+    assert(loc.s != NULL);
+    CSLocus *cloc = loc.cloc;
+    Datum     **d = loc.d;
+    LocPopSum  *s = loc.s;
+
+    loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
+    loc.s->tally_metapop(loc.cloc);
+    size_t snp_index = 0;
+
+    for (uint col = 0; col < cloc->len; col++) {
+        //
+        // If the site is polymorphic, ignore it.
+        //
+	if (snp_index < cloc->snps.size() && cloc->snps[snp_index]->col == col) {
+	    snp_index++;
+	    continue;
+	}
+	
+        size_t n_pruned_pops  = 0;
+        size_t n_samples_left = 0;
+
+        for (size_t p = 0; p < s->pop_cnt(); ++p) {
+            const LocSum* sum = s->per_pop(p);
+
+            if ((double) sum->nucs[col].num_indv() / this->_pop_tot[p] >= min_samples_per_pop) {
+                n_samples_left += sum->nucs[col].num_indv();
+            } else {
+                ++n_pruned_pops;
+
+                const Pop& pop = mpopi.pops()[p];
+                for (uint s = pop.first_sample; s <= pop.last_sample; ++s) {
+                    if (d[s] == NULL || col >= (uint) d[s]->len)
+                        continue;
+		    
+		    assert(d[s]->model);
+		    //
+		    // Update the model calls.
+		    //
+		    d[s]->model[col] = 'U';
+                }
+            }
+        }
+
+	bool sample_prune         = false;
+	bool overall_sample_prune = false;
+
+	if (mpopi.pops().size() - n_pruned_pops < (uint) min_populations)
+            sample_prune = true;
+        else if ((double) n_samples_left / mpopi.n_samples() < min_samples_overall)
+	    overall_sample_prune = true;
+
+	if (sample_prune || overall_sample_prune) {
+            this->_filtered_sites++;
+
+	    for (size_t s = 0; s < mpopi.n_samples(); ++s) {
+		if (d[s] == NULL || col >= (uint) d[s]->len)
+		    continue;
+
+		assert(d[s]->model);
+		//
+		// Update the model calls.
+		//
+		d[s]->model[col] = 'U';
+	    }
+	}
+    }
+}
+
 void
 LocusFilter::filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
 {
@@ -2666,6 +2771,15 @@ SumStatsSummary::accumulate(const vector<LocBin *> &loci)
             //
             if (t->nucs[pos].priv_allele >= 0)
                 _private_cnt[t->nucs[pos].priv_allele]++;
+	    
+	    else if (t->nucs[pos].priv_allele == two_priv_alleles) {
+		// If there are two private alleles at this position, find the two populations they occurred in.
+		for (uint pop = 0; pop < this->_pop_cnt; pop++) {
+                    s = loci[i]->s->per_pop(pop);
+                    if (s->nucs[pos].num_indv() > 0)
+			_private_cnt[pop]++;
+		}
+	    }
 
             if (t->nucs[pos].allele_cnt == 2) {
                 site_cnt++;


=====================================
src/populations.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2012-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2012-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -254,6 +254,7 @@ public:
     bool   filter(const MetaPopInfo *mpopi, Datum **d, CSLocus *cloc);
     void   filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
     void   filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
+    void   filter_fixed_sites(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
     void   gt_depth_filter(Datum **d, const CSLocus *cloc);
     void   keep_single_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const;
     void   keep_random_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const;


=====================================
src/process_radtags.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2011-2022, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -319,7 +319,7 @@ initialize_counters(map<string, long> &counter)
     counter["retained"]     = 0;
     counter["recovered"]    = 0;
     counter["transposed"]   = 0;
-
+    counter["proppair"]     = 0;
     return 0;
 }
 
@@ -339,6 +339,7 @@ consolidate_counters(map<string, long> &counter,
 	counter["retained"]     += cntr["retained"];
 	counter["recovered"]    += cntr["recovered"];
 	counter["transposed"]   += cntr["transposed"];
+	counter["proppair"]     += cntr["proppair"];
     }
 
     return 0;
@@ -358,11 +359,13 @@ consolidate_barcode_logs(map<BarcodePair, map<string, long>> &barcode_log,
 		barcode_log[bc]["total"]    = 0;
 		barcode_log[bc]["low_qual"] = 0;
 		barcode_log[bc]["retained"] = 0;
+		barcode_log[bc]["proppair"] = 0;
 	    }
 	    barcode_log[bc]["noradtag"] += it->second["noradtag"];
 	    barcode_log[bc]["total"]    += it->second["total"];
 	    barcode_log[bc]["low_qual"] += it->second["low_qual"];
 	    barcode_log[bc]["retained"] += it->second["retained"];
+	    barcode_log[bc]["proppair"] += it->second["proppair"];
 	}
     }
 
@@ -534,6 +537,9 @@ process_paired_reads(string prefix_1,
         int result_2 = 1;
 
         if (r_1->retain && r_2->retain) {
+	    counter["proppair"]++;
+	    barcode_log[bc]["proppair"]++;
+	    
             if (retain_header) {
                 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
                     write_fastq(pair_1_fhs[bc], s_1, r_1) :
@@ -1021,6 +1027,11 @@ process_paired_reads_parallel(string prefix_1,
 		if (r_2->retain)
 		    process_singlet(r_2, renz_2, true, blog[*bc], cntr);
 
+		if (r_1->retain && r_2->retain) {
+		    cntr["proppair"]++;
+		    blog[*bc]["proppair"]++;
+		}
+		
 		//
 		// Store the result in the output buffer for writing.
 		//
@@ -1911,6 +1922,7 @@ print_results(int argc, char **argv,
     c["ambiguous"]    = 0;
     c["noradtag"]     = 0;
     c["transposed"]   = 0;
+    c["proppair"]     = 0;
     
     //
     // Total up the individual counters
@@ -1924,6 +1936,7 @@ print_results(int argc, char **argv,
         c["noradtag"]     += it->second["noradtag"];
         c["retained"]     += it->second["retained"];
 	c["transposed"]   += it->second["transposed"];
+	c["proppair"]     += it->second["proppair"];
     }
 
     auto print_nreads = [&c] (long n, const string& legend) {
@@ -1970,8 +1983,10 @@ print_results(int argc, char **argv,
     log << "Barcode Not Found\t"     << c["ambiguous"]   << "\t" << as_percentage((double) c["ambiguous"] / c["total"]) << "\n"
         << "Low Quality\t"           << c["low_quality"] << "\t" << as_percentage((double) c["low_quality"] / c["total"]) << "\n"
         << "RAD Cutsite Not Found\t" << c["noradtag"]    << "\t" << as_percentage((double) c["noradtag"] / c["total"]) << "\n"
-        << "Retained Reads\t"        << c["retained"]    << "\t" << as_percentage((double) c["retained"] / c["total"]) << "\n"
-	<< "END total_raw_read_counts" << "\n";
+        << "Retained Reads\t"        << c["retained"]    << "\t" << as_percentage((double) c["retained"] / c["total"]) << "\n";
+    if (paired)    
+	log << "Properly Paired\t"   << c["proppair"]    << "\t" << as_percentage(((double) c["proppair"]*2.0) / c["total"]) << "\n";
+    log << "END total_raw_read_counts" << "\n";
 
     if (max_bc_size_1 == 0) return 0;
 
@@ -1997,8 +2012,11 @@ print_results(int argc, char **argv,
         << "RAD Cutsite Not Found\t"
         << "Low Quality\t"
         << "Retained Reads\t"
-	<< "Pct Retained\t"
-	<< "Pct Total Reads\n";
+	<< "Pct Retained\t";
+    if (paired)
+	log << "Properly Paired\t"
+	    << "Pct Properly Paired\t";
+    log << "Pct Total Reads\n";
 
     set<BarcodePair> barcode_list;
 
@@ -2011,13 +2029,17 @@ print_results(int argc, char **argv,
 
         if (barcode_log.count(barcodes[i]) == 0)
             log << "0\t" << "0\t" << "0\t" << "0\n";
-        else
+        else {
             log << barcode_log[barcodes[i]]["total"]    << "\t"
                 << barcode_log[barcodes[i]]["noradtag"] << "\t"
                 << barcode_log[barcodes[i]]["low_qual"] << "\t"
                 << barcode_log[barcodes[i]]["retained"] << "\t"
-		<< as_percentage((double) barcode_log[barcodes[i]]["retained"] / barcode_log[barcodes[i]]["total"]) << "\t"
-		<< as_percentage((double) barcode_log[barcodes[i]]["retained"] / c["total"]) << "\n";
+		<< as_percentage((double) barcode_log[barcodes[i]]["retained"] / barcode_log[barcodes[i]]["total"]) << "\t";
+	    if (paired)
+		log << barcode_log[barcodes[i]]["proppair"] << "\t"
+		    << as_percentage(((double) barcode_log[barcodes[i]]["proppair"]*2.0) / barcode_log[barcodes[i]]["total"]) << "\t";
+	    log << as_percentage((double) barcode_log[barcodes[i]]["retained"] / c["total"]) << "\n";
+	}
     }
 
     log << "END per_barcode_raw_read_counts" << "\n"


=====================================
src/renz.cc
=====================================
@@ -8,6 +8,8 @@ using namespace std;
 //
 const char *aciI[]    = {"CGC", "CGG",        // C/CGC, AciI
                          "GCG", "CCG"};
+const char *aclI[]    = {"CGTT",              // AA/CGTT, AclI
+                         "GCAA"};
 const char *ageI[]    = {"CCGGT",             // A/CCGGT, AgeI
                          "ACCGG"};
 const char *aluI[]    = {"CT",                // AG/CT, AluI
@@ -56,6 +58,8 @@ const char *ecoRV[]   = {"ATC",               // GAT/ATC, EcoRV
                          "GAT"};
 const char *ecoT22I[] = {"TGCAT",             // A/TGCAT, EcoT22I
                          "ATGCA"};
+const char *haeII[]   = {"GCGCA", "GCGCG",    // RGCGC/Y, HaeII
+                         "TGCGC", "CGCGC"};
 const char *haeIII[]  = {"CC",                // GG/CC, HaeIII
                          "GG"};
 const char *hhaI[]    = {"GCG",               // GCG/C, HhaI
@@ -150,6 +154,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
     renz["sgrAI"]   = sgrAI;   // CR/CCGGYG, SgrAI; R=A or G; Y=C or T
     renz["apeKI"]   = apeKI;   // G/CWGC, ApeKI; W=A or T
     renz["hindIII"] = hindIII; // A/AGCTT, HindIII
+    renz["haeII"]   = haeII;   // RGCGC/Y, HaeII
     renz["haeIII"]  = haeIII;  // GG/CC, HaeIII
     renz["dpnII"]   = dpnII;   // GATC, DpnII
     renz["sphI"]    = sphI;    // GCATG/C, SphI
@@ -183,6 +188,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
     renz["ageI"]    = ageI;    // A/CCGGT, AgeI
     renz["rsaI"]    = rsaI;    // GT/AC, RsaI
     renz["aciI"]    = aciI;    // C/CGC, AciI
+    renz["aclI"]    = aclI;    // AA/CGTT, AclI
     renz["bfaI"]    = bfaI;    // C/TAG, BfaI
     renz["bfuCI"]   = bfuCI;   // /GATC, BfuCI
     renz["aseI"]    = aseI;    // AT/TAAT, AseI
@@ -210,6 +216,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
     renz_cnt["sgrAI"]   = 2;
     renz_cnt["apeKI"]   = 2;
     renz_cnt["hindIII"] = 1;
+    renz_cnt["haeII"]   = 2;
     renz_cnt["haeIII"]  = 1;
     renz_cnt["dpnII"]   = 1;
     renz_cnt["sphI"]    = 1;
@@ -243,6 +250,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
     renz_cnt["ageI"]    = 1;
     renz_cnt["rsaI"]    = 1;
     renz_cnt["aciI"]    = 2;
+    renz_cnt["aclI"]    = 1;
     renz_cnt["bfaI"]    = 1;
     renz_cnt["bfuCI"]   = 1;
     renz_cnt["aseI"]    = 1;
@@ -260,7 +268,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
     renz_cnt["pspXI"]   = 3;
     renz_cnt["hpyCH4IV"] = 1;
     renz_cnt["ngoMIV"]  = 1;
-    
+
     renz_len["sbfI"]    = 6;
     renz_len["pstI"]    = 5;
     renz_len["pstIshi"] = 4;
@@ -271,6 +279,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
     renz_len["apeKI"]   = 4;
     renz_len["hindIII"] = 5;
     renz_len["haeIII"]  = 2;
+    renz_len["haeII"]   = 5;
     renz_len["dpnII"]   = 4;
     renz_len["sphI"]    = 5;
     renz_len["nlaIII"]  = 4;
@@ -303,6 +312,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
     renz_len["ageI"]    = 5;
     renz_len["rsaI"]    = 2;
     renz_len["aciI"]    = 3;
+    renz_len["aclI"]    = 4;
     renz_len["bfaI"]    = 3;
     renz_len["bfuCI"]   = 4;
     renz_len["aseI"]    = 4;


=====================================
src/sql_utilities.cc
=====================================
@@ -2,7 +2,9 @@
 
 using namespace std;
 
-void load_catalog_matches(string sample,  vector<CatMatch *> &matches, bool verbose) {
+void
+load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose)
+{
     CatMatch      *m;
     string         f;
     vector<string> parts;
@@ -54,8 +56,8 @@ void load_catalog_matches(string sample,  vector<CatMatch *> &matches, bool verb
             throw exception();
         }
 
-        char c = parts[3].at(0);
-        if (parts[3] != "consensus" && c != 'A' && c != 'C' && c != 'G' && c != 'T') {
+        char c = parts[2].at(0);
+        if (parts[2] != "consensus" && c != 'A' && c != 'C' && c != 'G' && c != 'T') {
             // This sample locus was blacklisted, because:
             // "multi": it matches multiple c-loci
             // "extra_snp": it has a SNP unknown to the catalog
@@ -66,13 +68,13 @@ void load_catalog_matches(string sample,  vector<CatMatch *> &matches, bool verb
 
         m = new CatMatch;
         m->cat_id    = atoi(parts[0].c_str());
-        m->sample_id = atoi(parts[1].c_str());
-        m->tag_id    = atoi(parts[2].c_str());
-        m->haplotype = new char[parts[3].length() + 1];
-        strcpy(m->haplotype, parts[3].c_str());
-        m->depth     = atoi(parts[4].c_str());
-        m->cigar     = new char[parts[5].length() + 1];
-        strcpy(m->cigar, parts[5].c_str());
+        ////m->sample_id = atoi(parts[1].c_str());
+        m->tag_id    = atoi(parts[1].c_str());
+        m->haplotype = new char[parts[2].length() + 1];
+        strcpy(m->haplotype, parts[2].c_str());
+        m->depth     = atoi(parts[3].c_str());
+        m->cigar     = new char[parts[4].length() + 1];
+        strcpy(m->cigar, parts[4].c_str());
 
         matches.push_back(m);
     }
@@ -86,7 +88,86 @@ void load_catalog_matches(string sample,  vector<CatMatch *> &matches, bool verb
     return;
 }
 
-int load_model_results(string sample,  map<int, ModRes *> &modres) {
+int
+load_catalog_samples(string path, map<string, size_t> &seen_samples, size_t &max_sample_id)
+{
+    string         f;
+    vector<string> parts;
+    long int       line_num;
+    ifstream       fh;
+    gzFile         gz_fh;
+
+    char *line      = (char *) malloc(sizeof(char) * max_len);
+    int   size      = max_len;
+    int   cnt       = 0;
+    bool  gzip      = false;
+    int   fh_status = 1;
+
+    f = path + ".sample_list.tsv";
+    fh.open(f.c_str(), ifstream::in);
+    if (fh.fail()) {
+        //
+        // Test for a gzipped file.
+        //
+        f = path + ".sample_list.tsv.gz";
+        gz_fh = gzopen(f.c_str(), "rb");
+        if (!gz_fh)
+            return -1;
+
+        #if ZLIB_VERNUM >= 0x1240
+        gzbuffer(gz_fh, libz_buffer_size);
+        #endif
+        gzip = true;
+    }
+
+    max_sample_id = 0;
+    line_num      = 1;
+
+    while (fh_status) {
+        fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
+        line_num++;
+
+        if (!fh_status && strlen(line) == 0)
+            continue;
+
+        if (is_comment(line)) continue;
+
+        parse_tsv(line, parts);
+
+        cnt = parts.size();
+
+        if (cnt != num_samlist_fields) {
+            cerr << "Error parsing '" << f.c_str() << "' at line: " << line_num << ". (" << parts.size() << " fields).\n";
+            throw exception();
+        }
+
+	int sample_id = is_integer(parts[1].c_str());
+	if (sample_id > max_sample_id)
+	    max_sample_id = sample_id;
+
+	string sample_name = parts[0];
+	if (seen_samples.count(sample_name) > 0) {
+	    // Insert failed.
+            cerr << "Error: Sample '" << sample_name << "' appears more than one time in this catalog. Samples must be unique." << endl;
+            throw exception();
+        } else {
+	    seen_samples[sample_name] = sample_id;
+	}
+    }
+
+    if (gzip)
+        gzclose(gz_fh);
+    else
+        fh.close();
+    free(line);
+
+    return 1;
+}
+
+int
+load_model_results(string sample,  map<int, ModRes *> &modres)
+{
+    //// Deprecated
     string         f;
     vector<string> parts;
     long int       line_num;
@@ -207,6 +288,7 @@ int load_model_results(string sample,  map<int, ModRes *> &modres) {
 }
 
 int load_snp_calls(string sample,  map<int, SNPRes *> &snpres) {
+    //// Deprecated
     string         f;
     int id, samp_id;
     vector<string> parts;


=====================================
src/sql_utilities.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -32,14 +32,20 @@
 //
 // The expected number of tab-separated fields in our SQL input files.
 //
-const uint num_tags_fields    = 9;
-const uint num_snps_fields    = 9;
-const uint num_alleles_fields = 5;
-const uint num_matches_fields = 6;
-
-void load_catalog_matches(string sample,  vector<CatMatch *> &matches, bool verbose=true);
-int load_model_results(string sample,  map<int, ModRes *> &modres);
-int load_snp_calls(string sample,  map<int, SNPRes *> &snpres);
+////const uint num_tags_fields    = 9;
+////const uint num_snps_fields    = 9;
+////const uint num_alleles_fields = 5;
+////const uint num_matches_fields = 6;
+const uint num_tags_fields    = 8;
+const uint num_snps_fields    = 8;
+const uint num_alleles_fields = 4;
+const uint num_matches_fields = 5;
+const uint num_samlist_fields = 2;
+
+void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose=true);
+int load_catalog_samples(string path, map<string, size_t> &seen_samples, size_t &max_sample_id);
+int load_model_results(string sample, map<int, ModRes *> &modres);
+int load_snp_calls(string sample, map<int, SNPRes *> &snpres);
 
 // retrieve_bijective_sloci()
 // ----------
@@ -126,9 +132,9 @@ load_loci(const string& sample,  map<int, LocusT *> &loci, int store_reads, bool
             return 0;
         }
 
-        id = atoi(parts[1].c_str());
+        id = atoi(parts[0].c_str());
 
-        if (parts[2] != "consensus") {
+        if (parts[1] != "consensus") {
             if (blacklisted.count(id)) continue;
 
             //
@@ -140,9 +146,9 @@ load_loci(const string& sample,  map<int, LocusT *> &loci, int store_reads, bool
                 // Read the model sequence, a series of letters specifying if the model called a
                 // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
                 //
-                if (parts[2] == "model") {
-                    loci[id]->model = new char[parts[5].length() + 1];
-                    strcpy(loci[id]->model, parts[5].c_str());
+                if (parts[1] == "model") {
+                    loci[id]->model = new char[parts[4].length() + 1];
+                    strcpy(loci[id]->model, parts[4].c_str());
 
                 } else {
                     //
@@ -153,23 +159,23 @@ load_loci(const string& sample,  map<int, LocusT *> &loci, int store_reads, bool
                     if (store_reads >= 1) {
                         if (store_reads >= 2) {
                             // Load the actual sequences (otherwise don't).
-                            char *read = new char[parts[5].length() + 1];
-                            strcpy(read, parts[5].c_str());
+                            char *read = new char[parts[4].length() + 1];
+                            strcpy(read, parts[4].c_str());
                             loci[id]->reads.push_back(read);
                         }
 
-                        char *read_id = new char[parts[4].length() + 1];
-                        strcpy(read_id, parts[4].c_str());
+                        char *read_id = new char[parts[3].length() + 1];
+                        strcpy(read_id, parts[3].c_str());
                         loci[id]->comp.push_back(read_id);
                         //
                         // Store the internal stack number for this read.
                         //
-                        loci[id]->comp_cnt.push_back(atoi(parts[3].c_str()));
+                        loci[id]->comp_cnt.push_back(atoi(parts[2].c_str()));
 
                         //
                         // Store the read type.
                         //
-                        if (parts[2] == "primary")
+                        if (parts[1] == "primary")
                             loci[id]->comp_type.push_back(primary);
                         else
                             loci[id]->comp_type.push_back(secondary);
@@ -187,26 +193,26 @@ load_loci(const string& sample,  map<int, LocusT *> &loci, int store_reads, bool
         // Do not include blacklisted tags in the catalog. They are tags that are composed
         // of noise and/or repetitive sequence.
         //
-        if (parts[7] == "1") {
+        if (parts[6] == "1") {
             blacklisted.insert(id);
             continue;
         }
 
         c = new LocusT;
-        c->sample_id = atoi(parts[0].c_str());
+        ////c->sample_id = atoi(parts[0].c_str());
         c->id        = id;
-        c->add_consensus(parts[5].c_str());
+        c->add_consensus(parts[4].c_str());
 
         //
         // Read in the flags
         //
-        c->deleveraged     = (parts[6] == "1" ? true : false);
-        c->lumberjackstack = (parts[8] == "1" ? true : false);
+        c->deleveraged     = (parts[5] == "1" ? true : false);
+        c->lumberjackstack = (parts[7] == "1" ? true : false);
 
         //
         // Parse the components of this stack (either the Illumina ID, or the catalog constituents)
         //
-        q = parts[4].c_str();
+        q = parts[3].c_str();
         while (*q != '\0') {
             for (p = q; *q != ',' && *q != '\0'; q++);
             len = q - p;
@@ -270,7 +276,7 @@ load_loci(const string& sample,  map<int, LocusT *> &loci, int store_reads, bool
             return 0;
         }
 
-        id = atoi(parts[1].c_str());
+        id = atoi(parts[0].c_str());
 
         if (blacklisted.count(id))
             continue;
@@ -278,31 +284,31 @@ load_loci(const string& sample,  map<int, LocusT *> &loci, int store_reads, bool
         //
         // Only load heterozygous model calls.
         //
-        if (load_all_model_calls == false && parts[3] != "E")
+        if (load_all_model_calls == false && parts[2] != "E")
             continue;
 
         snp         = new SNP;
-        snp->col    = atoi(parts[2].c_str());
-        snp->lratio = atof(parts[4].c_str());
-        snp->rank_1 = parts[5].at(0);
-        snp->rank_2 = parts[6].at(0) == '-' ? 0 : parts[6].at(0);
+        snp->col    = atoi(parts[1].c_str());
+        snp->lratio = atof(parts[3].c_str());
+        snp->rank_1 = parts[4].at(0);
+        snp->rank_2 = parts[5].at(0) == '-' ? 0 : parts[5].at(0);
 
-        if (parts[3] == "E")
+        if (parts[2] == "E")
             snp->type = snp_type_het;
-        else if (parts[3] == "O")
+        else if (parts[2] == "O")
             snp->type = snp_type_hom;
         else
             snp->type = snp_type_unk;
 
-        if (parts[7].length() == 0 || parts[7].at(0) == '-')
+        if (parts[6].length() == 0 || parts[6].at(0) == '-')
             snp->rank_3 = 0;
         else
-            snp->rank_3 = parts[7].at(0);
+            snp->rank_3 = parts[6].at(0);
 
-        if (parts[8].length() == 0 || parts[8].at(0) == '-')
+        if (parts[7].length() == 0 || parts[7].at(0) == '-')
             snp->rank_4 = 0;
         else
-            snp->rank_4 = parts[8].at(0);
+            snp->rank_4 = parts[7].at(0);
 
         if (loci.count(id) > 0) {
             loci[id]->snps.push_back(snp);
@@ -362,13 +368,13 @@ load_loci(const string& sample,  map<int, LocusT *> &loci, int store_reads, bool
             return 0;
         }
 
-        id = atoi(parts[1].c_str());
+        id = atoi(parts[0].c_str());
 
         if (blacklisted.count(id))
             continue;
 
         if (loci.count(id) > 0) {
-            loci[id]->alleles[parts[2]] = atoi(parts[4].c_str());
+            loci[id]->alleles[parts[1]] = atoi(parts[3].c_str());
         } else {
             cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". SNP asks for nonexistent locus with ID: " << id << "\n";
             return 0;


=====================================
src/sstacks.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -1261,7 +1261,6 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
                     qloc->alleles[qloc->matches[j]->cat_type] : qloc->depth;
 
             sstr << qloc->matches[j]->cat_id   << "\t"
-                 << samp_id                    << "\t"
                  << qloc->id                   << "\t"
                  << qloc->matches[j]->cat_type << "\t"
                  << match_depth                << "\t"


=====================================
src/tsv2bam.cc
=====================================
@@ -1,3 +1,22 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2016-2023, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
+//
 #include <cmath>
 #include <cstring>
 #include <iostream>
@@ -21,7 +40,8 @@
 void parse_command_line(int argc, char* argv[]);
 void report_options(ostream& os);
 int  run();
-void run(const vector<int>& cloc_ids,
+void run(const map<string, size_t> &sample_ids,
+	 const vector<int>& cloc_ids,
          const string& header_sq_lines,
          const unordered_map<int,size_t>& cloc_lengths,
          size_t sample_i, ostream& os
@@ -31,13 +51,14 @@ void cigar_apply_to_locus(Locus* l, const Cigar& c);
 //
 // Argument globals.
 //
-bool quiet = false;
-string in_dir;
+string         in_dir;
 vector<string> samples;
-string popmap_path;
+string         popmap_path;
 vector<string> pe_reads_paths;
-FileT pe_reads_format;
-int num_threads = 1;
+FileT          pe_reads_format;
+
+bool quiet       = false;
+int  num_threads = 1;
 
 bool dbg_reversed_pe_reads = false;
 bool dbg_only_bijective_loci = false;
@@ -50,7 +71,9 @@ unique_ptr<LogAlterator> logger = NULL;
 
 // main()
 // ==========
-int main(int argc, char* argv[]) {
+int
+main(int argc, char* argv[])
+{
     IF_NDEBUG_TRY
 
     // Parse arguments
@@ -80,26 +103,46 @@ int main(int argc, char* argv[]) {
     IF_NDEBUG_CATCH_ALL_EXCEPTIONS
 }
 
-int run() {
-
+int
+run()
+{
     //
     // Read the catalog.
     //
     cout << "Loading the catalog..." << endl;
-    map<int, Locus*> catalog;
+    map<int, Locus*>    catalog;
+    map<string, size_t> sample_ids;
+
     string catalog_prefix = in_dir + "catalog";
-    bool dummy;
-    int rv = load_loci(catalog_prefix, catalog, 0, false, dummy, false);
-    if (rv != 1) {
+    bool   compressed;
+    size_t max_sample_id  = 0;
+    size_t next_sample_id = 1;
+    
+    if (!load_loci(catalog_prefix, catalog, 0, false, compressed, false)) {
         cerr << "Error: Unable to load catalog '" << catalog_prefix << "'.\n";
         throw exception();
     }
-
+    //
+    // Assign arbitrary sample IDs to the incomming samples, respecting the IDs already assigend by cstacks.
+    // Since not all samples have to be part of the catalog, we may have additional samples now being
+    // processed that are not in the catalog and do not have an existing ID.
+    //
+    if (!load_catalog_samples(catalog_prefix, sample_ids, max_sample_id)) {
+	cerr << "Error: Unable to load catalog sample list '" << catalog_prefix << "'.\n";
+	throw exception();
+    }
+    next_sample_id = max_sample_id + 1;
+    for (size_t i = 0; i < samples.size(); ++i)
+	if (sample_ids.count(samples[i]) == 0) {
+	    sample_ids[samples[i]] = next_sample_id;
+	    next_sample_id++;
+	}
+    
     //
     // Create the BAM target list.
     //
     stringstream header_sq_lines_ss;
-    vector<int> cloc_ids;
+    vector<int>  cloc_ids;
     unordered_map<int,size_t> cloc_lengths;
     cloc_ids.reserve(catalog.size());
     for (auto& cloc : catalog) {
@@ -125,7 +168,7 @@ int run() {
     #pragma omp parallel
     {
         #pragma omp for schedule(dynamic)
-        for (size_t i=0; i<samples.size(); ++i) {
+        for (size_t i = 0; i < samples.size(); ++i) {
             if (omp_return != 0)
                 continue;
             try {
@@ -133,7 +176,7 @@ int run() {
                 cout << "Processing sample '" << samples[i] << "'...\n" << flush;
 
                 stringstream ss;
-                run(cloc_ids, header_sq_lines, cloc_lengths, i, ss);
+                run(sample_ids, cloc_ids, header_sq_lines, cloc_lengths, i, ss);
                 outputs[i] = ss.str();
 
             } catch (exception& e) {
@@ -155,12 +198,14 @@ int run() {
     return 0;
 }
 
-void run(const vector<int>& cloc_ids,
+void
+run(const map<string, size_t> &sample_ids,
+    const vector<int>& cloc_ids,
     const string& header_sq_lines,
     const unordered_map<int,size_t>& cloc_lengths,
     size_t sample_i,
-    ostream& os
-) {
+    ostream& os)
+{
     ostream& cout = os;
     cout << "Sample '" << samples.at(sample_i) << "':";
 
@@ -171,7 +216,7 @@ void run(const vector<int>& cloc_ids,
     //
     map<int, Locus*> sloci;
     vector<CatMatch*> matches;
-    int sample_id = -1;
+    int sample_id = sample_ids.at(samples.at(sample_i));
     {
         load_catalog_matches(prefix_path, matches, false);
         if (matches.empty()) {
@@ -179,7 +224,7 @@ void run(const vector<int>& cloc_ids,
                  << prefix_path + ".matches.tsv(.gz)'.\n";
             throw exception();
         }
-        sample_id = matches[0]->sample_id;
+        matches[0]->sample_id = sample_id;
 
         bool dummy;
         int rv = load_loci(prefix_path, sloci, 2, false, dummy, false);


=====================================
src/ustacks.cc
=====================================
@@ -71,7 +71,6 @@ try{
 
     cerr << "ustacks parameters selected:\n"
          << "  Input file: '" << in_file   << "'\n"
-         << "  Sample ID: "   << sample_id << "\n"
          << "  Min depth of coverage to create a stack (m): " << min_merge_cov << "\n"
          << "  Repeat removal algorithm: " << (remove_rep_stacks ? "enabled" : "disabled") << "\n"
          << "  Max distance allowed between stacks (M): " << max_utag_dist << "\n"
@@ -185,7 +184,7 @@ try{
     //
     // Assemble loci (merge primary stacks).
     //
-    cerr << "Assembling stacks (max. dist. M=" << max_utag_dist << ")...\n";
+    cerr << "Assembling stacks (max dist. M=" << max_utag_dist << ")...\n";
     calc_kmer_distance(merged, max_utag_dist);
     size_t n_blacklisted;
     merge_stacks(merged, n_blacklisted);
@@ -200,7 +199,7 @@ try{
     //
     // Merge secondary stacks.
     //
-    cerr << "Merging secondary stacks (max. dist. N=" << max_rem_dist << " from consensus)...\n";
+    cerr << "Merging secondary stacks (max dist. N=" << max_rem_dist << " from consensus)...\n";
     size_t utilized    = merge_remainders(merged, unique, remainders);
     call_consensus(merged, unique, remainders, false);
     size_t util_gapped = merge_gapped_remainders(merged, unique, remainders);
@@ -2378,8 +2377,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
         tag_1->calc_likelihood();
 
         // First write the consensus sequence
-        tags << sample_id          << "\t"
-             << tag_1->id          << "\t"
+	tags  << tag_1->id          << "\t"
              << "consensus\t"      << "\t"
              << "\t"
              << tag_1->con         << "\t"
@@ -2390,8 +2388,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
         //
         // Write a sequence recording the output of the SNP model for each nucleotide.
         //
-        tags << sample_id << "\t"
-             << tag_1->id << "\t"
+	tags << tag_1->id << "\t"
              << "model\t" << "\t"
              << "\t";
         for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
@@ -2418,8 +2415,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
             total += tag_2->count();
 
             for (uint j = 0; j < tag_2->map.size(); j++) {
-                tags << sample_id << "\t"
-                     << tag_1->id << "\t"
+		tags << tag_1->id << "\t"
                      << "primary\t"
                      << id << "\t"
                      << seq_ids[tag_2->map[j]] << "\t"
@@ -2436,8 +2432,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
             rem    = r[*k];
             total += rem->map.size();
             for (uint j = 0; j < rem->map.size(); j++)
-                tags << sample_id << "\t"
-                     << tag_1->id << "\t"
+		tags << tag_1->id << "\t"
                      << "secondary\t"
                      << "\t"
                      << seq_ids[rem->map[j]] << "\t"
@@ -2449,8 +2444,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
         // Write out the model calls for each nucleotide in this locus.
         //
         for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
-            snps << sample_id << "\t"
-                 << tag_1->id << "\t"
+	    snps << tag_1->id << "\t"
                  << (size_t) (*s)->col << "\t";
             switch((*s)->type) {
             case snp_type_het:
@@ -2474,8 +2468,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
         // the percentage of tags a particular allele occupies.
         //
         for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
-            alle << sample_id   << "\t"
-                 << tag_1->id   << "\t"
+	    alle << tag_1->id   << "\t"
                  << (*t).first  << "\t";
             sprintf(strbuf32, "%.2f", ((*t).second/total) * 100);
             alle << strbuf32 << "\t"
@@ -2859,7 +2852,6 @@ int parse_command_line(int argc, char* argv[]) {
             sample_name = optarg;
             break;
         case 'i':
-            sample_id = is_integer(optarg);
             break;
         case 'm':
             min_merge_cov = is_integer(optarg);
@@ -3016,11 +3008,6 @@ int parse_command_line(int argc, char* argv[]) {
     }
     prefix_path = out_path + sample_name;
 
-    if (sample_id < 0) {
-        cerr << "A unique sample ID must be provided (a positive integer).\n";
-        help();
-    }
-
     if (model_type == ::fixed && barcode_err_freq == 0) {
         cerr << "You must specify the barcode error frequency.\n";
         help();
@@ -3037,9 +3024,8 @@ void version() {
 
 void help() {
     cerr << "ustacks " << VERSION << "\n"
-         << "ustacks -f file_path -i id -o path [-M max_dist] [-m min_cov] [-p num_threads]" << "\n"
+         << "ustacks -f file_path -o path [-M max_dist] [-m min_cov] [-p num_threads]" << "\n"
          << "  f: input file path.\n"
-         << "  i: a unique integer ID for this sample.\n"
          << "  o: output path to write results.\n"
          << "  M: Maximum distance (in nucleotides) allowed between stacks (default 2).\n"
          << "  m: Minimum depth of coverage required to create a stack (default 3).\n"



View it on GitLab: https://salsa.debian.org/med-team/stacks/-/commit/eaf87a65c105869ea307fe13c0b77c60692e1c80

-- 
View it on GitLab: https://salsa.debian.org/med-team/stacks/-/commit/eaf87a65c105869ea307fe13c0b77c60692e1c80
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230829/2c9d293b/attachment-0001.htm>


More information about the debian-med-commit mailing list