[med-svn] [Git][med-team/stacks][upstream] New upstream version 2.65+dfsg
Lance Lin (@linqigang)
gitlab at salsa.debian.org
Tue Aug 29 17:05:32 BST 2023
Lance Lin pushed to branch upstream at Debian Med / stacks
Commits:
eaf87a65 by Lance Lin at 2023-08-29T20:52:57+07:00
New upstream version 2.65+dfsg
- - - - -
23 changed files:
- ChangeLog
- configure.ac
- scripts/denovo_map.pl
- scripts/stacks-integrate-alignments
- src/PopSum.cc
- src/PopSum.h
- src/Seq.h
- src/Vcf.cc
- src/Vcf.h
- src/clean.h
- src/constants.h
- src/cstacks.cc
- src/cstacks.h
- src/export_formats.cc
- src/populations.cc
- src/populations.h
- src/process_radtags.cc
- src/renz.cc
- src/sql_utilities.cc
- src/sql_utilities.h
- src/sstacks.cc
- src/tsv2bam.cc
- src/ustacks.cc
Changes:
=====================================
ChangeLog
=====================================
@@ -1,3 +1,36 @@
+Stacks 2.65 - August 18, 2023
+-----------------------------
+ Feature: Added a "properly paired" reads counter to process_radtags.
+ Feature: Extended populations filtering parameters to apply to fixed nucleotide sites; this
+ is applied in exports such as --vcf-all.
+ Feature: denovo_map.pl now accepts a second population map if you have a large data set and
+ would like to only load a subset of samples into the catalog.
+ Feature: Updated ustacks, cstacks, sstacks, and tsv2bam to no longer require
+ external sample ID numbers (-i option to ustacks). The pipeline will now
+ internally generate IDs when necessary, but most parts of the pipeline do
+ not need them any longer.
+ Feature: If a VCF input file for populations contains contig definitions
+ for a reference genome, those contigs will now be properly exported in any
+ VCF exports.
+ Feature: Added HaeII restriction enzyme.
+
+Stacks 2.64 - March 5, 2023
+---------------------------
+ Bugfix: the VCF export from populations could contain illegal fields for samples that have a
+ genotype call but effectively have no reads (reads contain unknown alleles or Ns in that
+ position). This could throw errors if a user then tried to import the VCF to populations
+ using --in-vcf.
+ Bugfix: private alleles. In the case of two populations where a particular site was differentially
+ fixed between the populations, neither allele was marked as private. Now both
+ alleles are marked as private.
+
+Stacks 2.63 - October 23, 2022
+------------------------------
+ Feature: added AslI restriction enzyme.
+ Bugfix: fixed error in the stacks-integrate-alignments script which could cause a
+ failure when processing single-end data ('contig' is not defined in
+ catalog.fa.gz for single-end data).
+
Stacks 2.62 - June 28, 2022
---------------------------
Feature: added a '--vcf-all' export to populations which will export fixed
=====================================
configure.ac
=====================================
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([Stacks], [2.62])
+AC_INIT([Stacks], [2.65])
AC_CONFIG_AUX_DIR([config])
AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
AC_CONFIG_SRCDIR([src/ustacks.cc])
=====================================
scripts/denovo_map.pl
=====================================
@@ -1,6 +1,6 @@
#!/usr/bin/env perl
#
-# Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+# Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
#
# This file is part of Stacks.
#
@@ -30,6 +30,7 @@ use constant false => 0;
my $dry_run = false;
my $exe_path = "_BINDIR_";
my $out_path = "";
+my $cpopmap_path = "";
my $popmap_path = "";
my $sample_path = "";
my $db = "";
@@ -64,6 +65,7 @@ my ($log, $log_fh, $sample);
my (@sample_list, %pop_ids, %pops, %grp_ids, %grps, %sample_ids);
parse_population_map(\@sample_list, \%pop_ids, \%pops, \%grp_ids, \%grps, \$genetic_map);
+parse_catalog_population_map(\@sample_list, \%pop_ids);
initialize_samples(\@parents, \@progeny, \@samples, \@sample_list, \%pop_ids, \%grp_ids);
@@ -161,7 +163,7 @@ sub execute_stacks {
next;
}
- $cmd = $exe_path . "ustacks -t $sample->{'fmt'} -f $sample->{'path'} -o $out_path -i $sample_id" . $minc;
+ $cmd = $exe_path . "ustacks -t $sample->{'fmt'} -f $sample->{'path'} -o $out_path" . $minc;
if ($sample->{'path'} !~ /$sample->{'file'}.$sample->{'suffix'}$/) {
# Guessing the sample name from the input path won't work.
@@ -290,7 +292,7 @@ sub execute_stacks {
}
#
- # Sort the reads according by catalog locus / run tsv2bam.
+ # Sort the reads according to catalog locus / run tsv2bam.
#
print STDERR "Sorting reads by RAD locus...\n";
print $log_fh "\ntsv2bam\n==========\n";
@@ -586,6 +588,44 @@ sub parse_population_map {
close($fh);
}
+sub parse_catalog_population_map {
+ my ($sample_list, $pop_ids) = @_;
+
+ my ($fh, @parts, $line, $sample);
+
+ return if (length($cpopmap_path) == 0);
+ return if ($genetic_map == true);
+
+ my $sample_cnt = 0;
+
+ open($fh, "<$cpopmap_path") or die("Unable to open catalog population map, '$cpopmap_path', $!\n");
+
+ while ($line = <$fh>) {
+ chomp $line;
+ next if ($line =~ /^\s*#/);
+
+ @parts = split(/\t/, $line);
+ if (scalar(@parts) != 2 and scalar(@parts) != 3) {
+ die("Unable to parse population map, '$cpopmap_path' (expected 2 or 3 columns, found " . scalar(@parts) . "); at line:\n$line\n");
+ }
+
+ foreach my $part (@parts) {
+ $part =~ s/^\s*|\s*$//g;
+ }
+
+ if (!defined($pop_ids->{$parts[0]})) {
+ print STDERR "Unable to find '$parts[0]' in '$popmap_path'. Samples listed in the catalog population map should also be present in the main population map.\n";
+ exit(1);
+ } else {
+ $sample_cnt++;
+ }
+ }
+
+ print STDERR " Parsed catalog population map: $sample_cnt of those ", scalar(@{$sample_list}), " files will be loaded into the catalog.\n";
+
+ close($fh);
+}
+
sub initialize_samples {
my ($parents, $progeny, $samples, $sample_list, $pop_ids, $grp_ids) = @_;
@@ -764,15 +804,9 @@ sub parse_command_line {
elsif ($_ =~ /^--resume$/) { $resume = true; }
elsif ($_ =~ /^--paired$/) { $paired = true; }
elsif ($_ =~ /^--samples$/) { $sample_path = shift @ARGV; }
- elsif ($_ =~ /^-O$/ || $_ =~ /^--popmap$/) {
- $popmap_path = shift @ARGV;
- push(@_cstacks, "-M " . $popmap_path);
- push(@_sstacks, "-M " . $popmap_path);
- push(@_tsv2bam, "-M " . $popmap_path);
- push(@_gstacks, "-M " . $popmap_path);
- push(@_populations, "-M " . $popmap_path);
-
- } elsif ($_ =~ /^-T$/ || $_ =~ /^--threads/) {
+ elsif ($_ =~ /^-O$/ || $_ =~ /^--popmap$/) { $popmap_path = shift @ARGV; }
+ elsif ($_ =~ /^--catalog-popmap$/) { $cpopmap_path = shift @ARGV; }
+ elsif ($_ =~ /^-T$/ || $_ =~ /^--threads/) {
$arg = shift @ARGV;
push(@_ustacks, "-p " . $arg);
push(@_cstacks, "-p " . $arg);
@@ -848,6 +882,18 @@ sub parse_command_line {
if (length($popmap_path) == 0) {
print STDERR "You must specify a population map that lists your sample names (--popmap).\n";
usage();
+ } elsif (length($cpopmap_path) > 0) {
+ push(@_cstacks, "-M " . $cpopmap_path);
+ push(@_sstacks, "-M " . $popmap_path);
+ push(@_tsv2bam, "-M " . $popmap_path);
+ push(@_gstacks, "-M " . $popmap_path);
+ push(@_populations, "-M " . $popmap_path);
+ } else {
+ push(@_cstacks, "-M " . $popmap_path);
+ push(@_sstacks, "-M " . $popmap_path);
+ push(@_tsv2bam, "-M " . $popmap_path);
+ push(@_gstacks, "-M " . $popmap_path);
+ push(@_populations, "-M " . $popmap_path);
}
if (length($sample_path) == 0) {
@@ -882,7 +928,7 @@ sub usage {
denovo_map.pl --samples dir --popmap path --out-path dir [--paired [--rm-pcr-duplicates]] (assembly options) (filtering options) [-X prog:"opts" ...]
Input/Output files:
- --samples: path to the directory containing the samples reads files.
+ --samples: path to the directory containing the reads files for each sample.
--popmap: path to a population map file (format is "<name> TAB <pop>", one sample per line).
-o,--out-path: path to an output directory.
@@ -908,7 +954,10 @@ denovo_map.pl --samples dir --popmap path --out-path dir [--paired [--rm-pcr-dup
Population filtering options:
-r,--min-samples-per-pop: minimum percentage of individuals in a population required to process a locus for that population (for populations; default: 0)
-p,--min-populations: minimum number of populations a locus must be present in to process a locus (for populations; default: 1)
-
+
+ For large datasets:
+ --catalog-popmap: path to a second population map file containing a subset of samples for use only in building the catalog.
+
Miscellaneous:
--time-components (for benchmarking)
EOQ
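For the new --catalog-popmap option above: the catalog map uses the same "<name> TAB <pop>" format as the main map, and parse_catalog_population_map() requires every sample it lists to also appear in the main map. A hypothetical pair of files (sample and population names are illustrative):

    # popmap.tsv -- all samples, passed via --popmap
    sample_01	pop_A
    sample_02	pop_A
    sample_03	pop_B
    sample_04	pop_B

    # catalog_popmap.tsv -- subset passed via --catalog-popmap;
    # only these samples are loaded into the catalog by cstacks
    sample_01	pop_A
    sample_03	pop_B

Note in parse_command_line above that only cstacks receives the catalog map (-M $cpopmap_path); sstacks, tsv2bam, gstacks, and populations still run on the full map.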
=====================================
scripts/stacks-integrate-alignments
=====================================
@@ -452,6 +452,7 @@ def write_catalog(loci, chrs, datestamp, new_locus_id, ordered_loci):
parts = line.split(' ')
loc_id = int(parts[0][1:])
ns = parts[1]
+ contig = ""
if len(parts) > 2:
contig = parts[2]
line = ""
=====================================
src/PopSum.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -106,13 +106,13 @@ LocPopSum::tally_fixed_pos(const CSLocus *cloc, Datum const*const* d, LocSum *s,
s->nucs[pos].reset();
for (uint i = start; i <= end; i++) {
- if (d[i] == NULL || pos >= d[i]->len)
+ if (d[i] == NULL || pos >= d[i]->len || d[i]->model[pos] == 'U')
continue;
//
// Before counting this individual, make sure the model definitively called this
- // position as hEterozygous or hOmozygous.
+ // position as hOmozygous.
//
- assert(d[i]->model[pos] != 'E');
+ assert(d[i]->model[pos] == 'O');
num_indv++;
p_nuc = cloc->con[pos];
}
@@ -471,7 +471,7 @@ LocPopSum::log_hwp_pr(double n_fac, double p_fac, double q_fac, double pp, doubl
int
LocPopSum::tally_metapop(const CSLocus *cloc)
{
- int variable_pop;
+ int16_t variable_pop;
uint16_t p_cnt, q_cnt, col;
LocSum **s = this->_per_pop;
LocTally *mp = this->_meta_pop;
@@ -553,7 +553,16 @@ LocPopSum::tally_metapop(const CSLocus *cloc)
if (s[j]->nucs[col].p_nuc() == mp->nucs[col].q_allele ||
s[j]->nucs[col].q_nuc() == mp->nucs[col].q_allele)
variable_pop = j;
- }
+ } else if (p_cnt == 1 && q_cnt == 1) {
+ //
+ // This is a special case where there are only two populations
+ // with fixed, different alleles. Here both alleles are private
+ // but we can't store both populations, so we will store the max
+ // integer value instead.
+ //
+ if (mp->nucs[col].pop_cnt == 2)
+ variable_pop = two_priv_alleles;
+ }
mp->nucs[col].priv_allele = variable_pop;
}
@@ -566,13 +575,17 @@ LocPopSum::tally_ref_alleles(int snp_index, uint16_t &allele_cnt,
uint16_t &p_cnt, uint16_t &q_cnt)
{
int nucs[4] = {0};
- char nuc[2];
+ char nuc[2] = {0};
p_allele = 0;
q_allele = 0;
allele_cnt = 0;
for (uint j = 0; j < this->_pop_cnt; j++) {
+
+ if (this->_per_pop[j]->nucs[snp_index].num_indv() == 0)
+ continue;
+
nuc[0] = 0;
nuc[1] = 0;
nuc[0] = this->_per_pop[j]->nucs[snp_index].p_nuc();
@@ -614,6 +627,12 @@ LocPopSum::tally_ref_alleles(int snp_index, uint16_t &allele_cnt,
return 0;
}
+ if (allele_cnt == 0) {
+ p_allele = 0;
+ q_allele = 0;
+ return 1;
+ }
+
//
// Record which nucleotide is the P allele and which is the Q allele.
//
@@ -661,6 +680,10 @@ LocPopSum::tally_ref_alleles(int snp_index, uint16_t &allele_cnt,
q_cnt = 0;
for (uint j = 0; j < this->_pop_cnt; j++) {
+
+ if (this->_per_pop[j]->nucs[snp_index].num_indv() == 0)
+ continue;
+
nuc[0] = 0;
nuc[1] = 0;
nuc[0] = this->_per_pop[j]->nucs[snp_index].p_nuc();
@@ -765,9 +788,11 @@ LocPopSum::haplotype_diversity(int start, int end, Datum const*const* d)
//
// If this haplotype is fixed, don't calculate any statistics.
//
- if (n == 0)
+ if (n == 0) {
+ delete lstat;
return NULL;
-
+ }
+
//
// Store a summary of the haplotype counts to output below.
//
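The private-allele special case added above can be hard to follow inside tally_metapop(); here is a minimal standalone sketch of the decision, with simplified types (a hypothetical helper, not the Stacks API):

    #include <cstdint>

    const int16_t two_priv_alleles = -2; // sentinel, defined in constants.h below

    // What to record in priv_allele for one nucleotide site.
    // p_cnt/q_cnt: number of populations carrying the P/Q allele;
    // pop_cnt: number of populations genotyped at this site;
    // var_pop: index of the single population carrying the rarer allele.
    int16_t classify_private_allele(uint16_t p_cnt, uint16_t q_cnt,
                                    uint16_t pop_cnt, int16_t var_pop)
    {
        if ((p_cnt == 1 && q_cnt > 1) || (q_cnt == 1 && p_cnt > 1))
            return var_pop;          // one allele is private to one population
        if (p_cnt == 1 && q_cnt == 1 && pop_cnt == 2)
            return two_priv_alleles; // two populations, differentially fixed:
                                     // both alleles are private
        return -1;                   // no private allele
    }

The sentinel is then consumed in populations.cc (SumStatsSummary::accumulate credits both populations) and in export_formats.cc (the site is flagged as private in the sumstats export).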
=====================================
src/PopSum.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
=====================================
src/Seq.h
=====================================
@@ -144,7 +144,7 @@ inline
void Seq::reserve(size_t n, bool with_qual) {
if (capacity < long(n)) {
capacity = n;
- delete seq;
+ delete [] seq;
seq = new char[capacity + 1];
*seq = '\0';
if (with_qual) {
=====================================
src/Vcf.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -372,6 +372,7 @@ VcfParser::read_header()
// Skip header lines missing the '='.
continue;
}
+
header_.add_meta(VcfMeta(string(line+2, equal), string(equal+1)));
}
if (!not_eof)
=====================================
src/Vcf.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
=====================================
src/clean.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at uoregon.edu>
//
// This file is part of Stacks.
//
@@ -396,6 +396,7 @@ process_barcode(RawRead *href_1, RawRead *href_2, BarcodePair &bc,
barcode_log[bc]["total"] = 0;
barcode_log[bc]["low_qual"] = 0;
barcode_log[bc]["retained"] = 0;
+ barcode_log[bc]["proppair"] = 0;
}
barcode_log[bc]["total"] += paired ? 2 : 1;
=====================================
src/constants.h
=====================================
@@ -159,6 +159,12 @@ const unsigned int max_samples = max_alleles / 2;
//
const double FSNC = -7.0;
+//
+// A flag to store the special case when we are searching for private alleles
+// and find two, alternative private alleles at the same locus.
+//
+const int16_t two_priv_alleles = -2;
+
//
// Supported file types
//
=====================================
src/cstacks.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -42,6 +42,7 @@ double min_match_len = 0.80;
double max_gaps = 2.0;
size_t next_catalog_id = 1;
+size_t next_sample_id = 1;
using namespace std;
@@ -71,14 +72,13 @@ int main (int argc, char* argv[]) {
map<int, CLocus *> catalog;
pair<int, string> s;
- bool compressed = false;
- int i;
+ int i;
- set<int> seen_sample_ids; // For checking sample ID unicity.
+ map<string, size_t> seen_samples; // For checking sample ID unicity.
if (catalog_path.length() > 0) {
cerr << "\nInitializing existing catalog...\n";
- if (!initialize_existing_catalog(catalog_path, catalog)) {
+ if (!initialize_existing_catalog(catalog_path, catalog, seen_samples, next_sample_id)) {
cerr << "Error: Failed to initialize the catalog.\n";
throw exception();
}
@@ -89,12 +89,11 @@ int main (int argc, char* argv[]) {
samples.pop();
cerr << "\nInitializing new catalog...\n";
- if (!initialize_new_catalog(s, catalog)) {
+ if (!initialize_new_catalog(s, catalog, seen_samples, next_sample_id)) {
cerr << "Error: Failed to initialize the catalog.\n";
throw exception();
}
i = 2;
- seen_sample_ids.insert(catalog.begin()->second->sample_id);
}
while (!samples.empty()) {
@@ -105,6 +104,7 @@ int main (int argc, char* argv[]) {
cerr << "\nProcessing sample " << s.second << " [" << i << " of " << sample_cnt << "]\n";
+ bool compressed = false;
if (!load_loci(s.second, sample, 0, false, compressed)) {
cerr << "Failed to load sample " << i << "\n";
continue;
@@ -112,15 +112,18 @@ int main (int argc, char* argv[]) {
//
// Assign the ID for this sample data.
//
- s.first = sample.begin()->second->sample_id;
+ s.first = next_sample_id;
- // Check for unique sample IDs.
- if (!seen_sample_ids.insert(s.first).second) {
+ // Check for unique sample names.
+ if (seen_samples.count(s.second) > 0) {
// Duplicate sample name.
- cerr << "Error: Sample ID '" << s.first << "' occurs more than once. Sample IDs must be unique." << endl;
+ cerr << "Error: Sample '" << s.second << "' is already inserted into this catalog. Samples must be unique." << endl;
throw exception();
- }
-
+ } else {
+ seen_samples[s.second] = next_sample_id;
+ }
+ next_sample_id++;
+
//dump_loci(sample);
cerr << "Searching for sequence matches...\n";
@@ -163,7 +166,7 @@ int main (int argc, char* argv[]) {
cerr << "\nWriting catalog in directory '" << out_path << "'.\n"
<< "Final catalog contains " << catalog.size() << " loci.\n";
- write_catalog(catalog);
+ write_catalog(catalog, seen_samples);
cerr << "cstacks is done.\n";
//
@@ -177,27 +180,6 @@ int main (int argc, char* argv[]) {
IF_NDEBUG_CATCH_ALL_EXCEPTIONS
}
-int update_catalog_index(map<int, CLocus *> &catalog, map<string, int> &cat_index) {
- map<int, CLocus *>::iterator j;
- char id[id_len];
-
- for (j = catalog.begin(); j != catalog.end(); j++) {
- snprintf(id, id_len - 1, "%s|%d|%c",
- j->second->loc.chr(),
- j->second->loc.bp,
- j->second->loc.strand == strand_plus ? '+' : '-');
-
- if (cat_index.count(id) == 0) {
- cat_index[id] = j->first;
- } else {
- if (cat_index[id] != j->first)
- cerr << "Error: Catalog index mismatch, key: '" << id << "'.\n";
- }
- }
-
- return 0;
-}
-
int
characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
{
@@ -1042,8 +1024,9 @@ int find_matches_by_genomic_loc(map<string, int> &cat_index, map<int, QLocus *>
return 0;
}
-int write_catalog(map<int, CLocus *> &catalog) {
-
+int
+write_catalog(map<int, CLocus *> &catalog, map<string, size_t> &seen_samples)
+{
map<int, CLocus *>::iterator i;
CLocus *tag;
set<int> matches;
@@ -1053,18 +1036,24 @@ int write_catalog(map<int, CLocus *> &catalog) {
string tag_file = out_path + "catalog.tags.tsv";
string snp_file = out_path + "catalog.snps.tsv";
string all_file = out_path + "catalog.alleles.tsv";
-
+ string sam_file = out_path + "catalog.sample_list.tsv";
+
if (gzip) {
tag_file += ".gz";
snp_file += ".gz";
all_file += ".gz";
+ sam_file += ".gz";
}
+ //
+ // Write the sample list.
+ //
+
//
// Open the output files for writing.
//
- gzFile gz_tags, gz_snps, gz_alle;
- ofstream tags, snps, alle;
+ gzFile gz_tags, gz_snps, gz_alle, gz_sams;
+ ofstream tags, snps, alle, sams;
if (gzip) {
gz_tags = gzopen(tag_file.c_str(), "wb");
if (!gz_tags) {
@@ -1090,13 +1079,24 @@ int write_catalog(map<int, CLocus *> &catalog) {
#if ZLIB_VERNUM >= 0x1240
gzbuffer(gz_alle, libz_buffer_size);
#endif
+
+ gz_sams = gzopen(sam_file.c_str(), "wb");
+ if (!gz_sams) {
+ cerr << "Error: Unable to open gzipped catalog sample list file '" << sam_file << "': " << strerror(errno) << ".\n";
+ exit(1);
+ }
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_sams, libz_buffer_size);
+ #endif
} else {
tags.open(tag_file.c_str());
snps.open(snp_file.c_str());
alle.open(all_file.c_str());
+ sams.open(sam_file.c_str());
check_open(tags, tag_file);
check_open(snps, snp_file);
check_open(alle, all_file);
+ check_open(sams, sam_file);
}
//
@@ -1116,10 +1116,12 @@ int write_catalog(map<int, CLocus *> &catalog) {
gzputs_throwing(gz_tags, log.str().c_str());
gzputs_throwing(gz_snps, log.str().c_str());
gzputs_throwing(gz_alle, log.str().c_str());
+ gzputs_throwing(gz_sams, log.str().c_str());
} else {
tags << log.str();
snps << log.str();
alle << log.str();
+ sams << log.str();
}
for (i = catalog.begin(); i != catalog.end(); i++) {
@@ -1131,6 +1133,31 @@ int write_catalog(map<int, CLocus *> &catalog) {
write_simple_output(tag, tags, snps, alle);
}
+ //
+ // Write the list of samples incorporated into the catalog, sorted by sample ID.
+ //
+ vector<pair<string, size_t>> sorted_seen_samples;
+ for (auto it = seen_samples.begin(); it != seen_samples.end(); it++) {
+ string sample_path = it->first;
+ // Remove the file path off the sample string.
+ int pos = sample_path.find_last_of("/");
+ sample_path = sample_path.substr(pos+1);
+ sorted_seen_samples.push_back(make_pair(sample_path, it->second));
+ }
+ sort(sorted_seen_samples.begin(), sorted_seen_samples.end(),
+ [] (pair<string, size_t> i, pair<string, size_t> j) { return (i.second < j.second); });
+
+ for (auto it = sorted_seen_samples.begin(); it != sorted_seen_samples.end(); it++) {
+ log.str("");
+ log << it->first << "\t" << it->second << "\n";
+
+ if (gzip) {
+ gzputs_throwing(gz_sams, log.str().c_str());
+ } else {
+ sams << log.str();
+ }
+ }
+
time(&rawtime);
timeinfo = localtime(&rawtime);
strftime(date, 32, "%F %T", timeinfo);
@@ -1150,10 +1177,12 @@ int write_catalog(map<int, CLocus *> &catalog) {
gzclose_throwing(gz_tags);
gzclose_throwing(gz_snps);
gzclose_throwing(gz_alle);
+ gzclose_throwing(gz_sams);
} else {
tags.close();
snps.close();
alle.close();
+ sams.close();
}
return 0;
@@ -1613,8 +1642,7 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
}
sources = sources.substr(0, sources.length() - 1);
- cat_file << 0 << "\t" // Catalog has no sample ID.
- << tag->id << "\t"
+ cat_file << tag->id << "\t"
<< "consensus" << "\t"
<< "0" << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
<< sources << "\t"
@@ -1627,8 +1655,7 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
// Output the SNPs associated with the catalog tag
//
for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
- snp_file << "0" << "\t"
- << tag->id << "\t"
+ snp_file << tag->id << "\t"
<< (*snp_it)->col << "\t";
switch((*snp_it)->type) {
@@ -1655,8 +1682,7 @@ write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstrea
// Output the alleles associated with the two matched tags
//
for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
- all_file << "0" << "\t"
- << tag->id << "\t"
+ all_file << tag->id << "\t"
<< all_it->first << "\t"
<< "0" << "\t" // These two fields are used in the
<< "0" << "\n"; // ustacks/pstacks output, not in cstacks.
@@ -1678,8 +1704,7 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
sstr.str("");
- sstr << 0 << "\t" // Catalog has no sample ID.
- << tag->id << "\t"
+ sstr << tag->id << "\t"
<< "consensus" << "\t"
<< "0" << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
<< sources << "\t"
@@ -1695,8 +1720,7 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
// Output the SNPs associated with the catalog tag
//
for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
- sstr << "0" << "\t"
- << tag->id << "\t"
+ sstr << tag->id << "\t"
<< (*snp_it)->col << "\t";
switch((*snp_it)->type) {
@@ -1726,8 +1750,7 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
// Output the alleles associated with the two matched tags
//
for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
- sstr << "0" << "\t"
- << tag->id << "\t"
+ sstr << tag->id << "\t"
<< all_it->first << "\t"
<< 0 << "\t"
<< 0 << "\n";
@@ -1738,7 +1761,8 @@ write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_f
}
int
-initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
+initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog,
+ map<string, size_t> &seen_samples, size_t &next_sample_id)
{
map<int, CLocus *> tmp_catalog;
bool compressed = false;
@@ -1751,7 +1775,7 @@ initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
- sample.first = tmp_catalog.begin()->second->sample_id;
+ sample.first = next_sample_id;
//
// Iterate over the catalog entires and renumber them after recording the source of
@@ -1768,11 +1792,15 @@ initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
cerr << " " << catalog.size() << " loci were newly added to the catalog.\n";
+ seen_samples[sample.second] = next_sample_id;
+ next_sample_id++;
+
return 1;
}
int
-initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
+initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog,
+ map<string, size_t> &seen_samples, size_t &next_sample_id)
{
bool compressed;
@@ -1823,6 +1851,13 @@ initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
next_catalog_id = max_catalog_id + 1;
+ //
+ // Parse the list of samples in the existing catalog.
+ //
+ if (!load_catalog_samples(catalog_path, seen_samples, next_sample_id))
+ throw exception();
+ next_sample_id++;
+
return 1;
}
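The new catalog.sample_list.tsv written above holds one "<file name> TAB <sample ID>" row per sample, with the directory path stripped and rows sorted by ID, preceded by the same generated comment header as the tags/snps/alleles files. Hypothetical contents for a three-sample catalog (names and header text are illustrative):

    # cstacks version 2.65; catalog generated on ...
    sample_01	1
    sample_02	2
    sample_03	3

load_catalog_samples() in sql_utilities.cc reads this file back (plain or gzipped) when resuming an existing catalog, so newly added samples continue numbering from the highest recorded ID.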
=====================================
src/cstacks.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2018, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -58,8 +58,8 @@ using std::queue;
void help( void );
void version( void );
int parse_command_line(int, char**);
-int initialize_new_catalog(pair<int, string> &, map<int, CLocus *> &);
-int initialize_existing_catalog(string, map<int, CLocus *> &);
+int initialize_new_catalog(pair<int, string> &, map<int, CLocus *> &, map<string, size_t> &, size_t &);
+int initialize_existing_catalog(string, map<int, CLocus *> &, map<string, size_t> &, size_t &);
int update_catalog_index(map<int, CLocus *> &, map<string, int> &);
int merge_catalog_loci(map<int, CLocus *> &, vector<int> &);
int find_kmer_matches_by_sequence(map<int, CLocus *> &, map<int, QLocus *> &, int);
@@ -72,7 +72,7 @@ int merge_matches(map<int, CLocus *> &, map<int, QLocus *> &, pair<int, string>
int add_unique_tag(pair<int, string> &, map<int, CLocus *> &, QLocus *);
bool compare_dist(pair<int, int>, pair<int, int>);
-int write_catalog(map<int, CLocus *> &);
+int write_catalog(map<int, CLocus *> &, map<string, size_t> &);
int write_simple_output(CLocus *, ofstream &, ofstream &, ofstream &);
int write_gzip_output(CLocus *, gzFile &, gzFile &, gzFile &);
=====================================
src/export_formats.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2012-2022, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2012-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -283,7 +283,9 @@ SumstatsExport::write_batch(const vector<LocBin *> &loci)
<< s->nucs[pos].smoothed(1) << "\t" // Smoothed Fis
<< s->nucs[pos].bs(1) << "\t" // Fis bootstrapped p-value.
<< s->nucs[pos].stat(2) << "\t"; // HWE p-value.
- (t->nucs[pos].priv_allele == (int) pop) ? this->_fh << "1\n" : this->_fh << "0\n";
+ //t->nucs[pos].priv_allele == (int) pop ? this->_fh << "1\n" : this->_fh << "0\n";
+ (t->nucs[pos].priv_allele == (int) pop ||
+ t->nucs[pos].priv_allele == two_priv_alleles) ? this->_fh << "1\n" : this->_fh << "0\n";
}
}
}
@@ -2646,11 +2648,17 @@ VcfExport::write_site(const CSLocus* cloc,
// DP.
sample << ":" << d[s]->snpdata[index].tot_depth;
// AD.
- if (d[s]->snpdata[index].nt_depths.sum() > 0)
+ if (d[s]->snpdata[index].nt_depths.sum() > 0) {
sample << ":" << d[s]->snpdata[index].nt_depths[Nt2(ref)]
<< "," << d[s]->snpdata[index].nt_depths[Nt2(alt)];
- else
- sample << ":.";
+ } else {
+ //
+ // If we arrive here, there are no nucleotides present for this individual at this
+ // site, which means either there are Ns here or a nucleotide that was not part of
+ // the genotype call and was discarded.
+ //
+ sample << ":0,0";
+ }
// GQ.
assert(d[s]->snpdata[index].gq != UINT8_MAX);
sample << ':' << int(d[s]->snpdata[index].gq);
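The AD change above matters because AD must carry one integer per allele; writing a bare "." after a called genotype produced records that VCF consumers, including populations' own --in-vcf import per the 2.64 ChangeLog entry, could reject. A minimal sketch of the per-sample column for a biallelic site (a hypothetical helper, simplified relative to VcfExport::write_site):

    #include <sstream>
    #include <string>

    // Format the GT:DP:AD portion of one VCF sample column.
    std::string sample_field(int gt0, int gt1, int tot_depth,
                             int ref_depth, int alt_depth)
    {
        std::ostringstream s;
        s << gt0 << '/' << gt1 << ':' << tot_depth << ':';
        if (ref_depth + alt_depth > 0)
            s << ref_depth << ',' << alt_depth;
        else
            s << "0,0"; // no usable reads (Ns, or alleles discarded from the
                        // genotype call): emit zeros, not ".", so AD remains
                        // a valid comma-separated integer list
        return s.str();
    }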
=====================================
src/populations.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2012-2022, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2012-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -161,6 +161,11 @@ try {
<< in_path << "' as a single population.\n";
}
+ if (bs_wl_file.length() > 0) {
+ load_marker_list(bs_wl_file, bootstraplist);
+ cerr << "Loaded " << bootstraplist.size() << " markers to include when bootstrapping.\n";
+ }
+
//
// Locate and open input files, read VCF headers, parse population map, load black/white lists.
//
@@ -806,6 +811,34 @@ BatchLocusProcessor::init_external_loci(string in_path, string pmap_path)
cout << "Opening the VCF file...\n";
this->_vcf_parser.open(in_path);
+ //
+ // If this VCF input file contained contig definitions, move them to the popmap for later export.
+ //
+ const vector<VcfMeta>& vm = this->_vcf_parser.header().meta();
+ for (auto &vmit: vm) {
+ if (vmit.key() == "contig") {
+ //
+ // Parse a string that looks like: <ID=NW_024121099.1,length=59790976>
+ //
+ const char *p, *q;
+ for (p = vmit.value().c_str(); *p != '\0' && *p != '='; p++);
+ for (q = p + 1; *q != '\0' && *q != ','; q++);
+ if (*p == '\0' || *q == '\0') {
+ cerr << "Error parsing VCF header line: ##" << vmit.key() << "=" << vmit.value() << "\n";
+ throw exception();
+ }
+ string key = string(p+1, q);
+ for (p = q + 1; *p != '\0' && *p != '='; p++);
+ for (q = p + 1; *q != '\0' && *q != '>'; q++);
+ if (*p == '\0' || *q == '\0') {
+ cerr << "Error parsing VCF header line: ##" << vmit.key() << "=" << vmit.value() << "\n";
+ throw exception();
+ }
+ string val = string(p+1, q);
+ this->_mpopi->add_contig(key, val);
+ }
+ }
+
if (this->_vcf_parser.header().samples().empty()) {
cerr << "Error: No samples in VCF file '" << in_path << "'.\n";
throw exception();
@@ -1084,13 +1117,13 @@ LocusFilter::erase_snp(CSLocus *cloc, Datum **d, size_t n_samples, size_t snp_in
assert(d[s]->snpdata.size() == cloc->snps.size());
//
- // Correct the model calls.
+ // Update the model calls.
//
char& m = d[s]->model[col];
assert(m == 'O' || m == 'E' || m == 'U');
if (m == 'E' || (m == 'O' && d[s]->obshap[0][snp_index] != cloc->con[col]))
m = 'U';
- //
+ //
// Erase the nucleotide in the haplotypes.
//
for (char* hap : d[s]->obshap) {
@@ -1241,6 +1274,7 @@ LocusFilter::apply_filters_stacks(LocBin& loc, ostream& log_fh, const MetaPopInf
//
loc.s = new LocPopSum(strlen(loc.cloc->con), mpopi);
this->filter_snps(loc, mpopi, log_fh);
+ this->filter_fixed_sites(loc, mpopi, log_fh);
//
// If write_single_snp, write_random_snp or filter_haplotype_wise has been specified,
@@ -1483,7 +1517,7 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
// If the site is fixed, ignore it.
//
NucTally& nuct = t->nucs[col];
- if (t->nucs[col].fixed == true)
+ if (nuct.fixed == true)
continue;
size_t n_pruned_pops = 0;
@@ -1512,7 +1546,7 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
else if ((double) n_samples_left / mpopi.n_samples() < min_samples_overall)
overall_sample_prune = true;
- if (t->nucs[col].allele_cnt > 1) {
+ if (nuct.allele_cnt > 1) {
//
// Test for minor allele frequency.
//
@@ -1522,7 +1556,7 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
//
// Test for observed heterozygosity.
//
- if (t->nucs[col].obs_het > max_obs_het)
+ if (nuct.obs_het > max_obs_het)
het_prune = true;
}
@@ -1550,6 +1584,77 @@ LocusFilter::filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
}
}
+void
+LocusFilter::filter_fixed_sites(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
+{
+ assert(loc.s != NULL);
+ CSLocus *cloc = loc.cloc;
+ Datum **d = loc.d;
+ LocPopSum *s = loc.s;
+
+ loc.s->sum_pops(loc.cloc, loc.d, mpopi, verbose, cout);
+ loc.s->tally_metapop(loc.cloc);
+ size_t snp_index = 0;
+
+ for (uint col = 0; col < cloc->len; col++) {
+ //
+ // If the site is polymorphic, ignore it.
+ //
+ if (snp_index < cloc->snps.size() && cloc->snps[snp_index]->col == col) {
+ snp_index++;
+ continue;
+ }
+
+ size_t n_pruned_pops = 0;
+ size_t n_samples_left = 0;
+
+ for (size_t p = 0; p < s->pop_cnt(); ++p) {
+ const LocSum* sum = s->per_pop(p);
+
+ if ((double) sum->nucs[col].num_indv() / this->_pop_tot[p] >= min_samples_per_pop) {
+ n_samples_left += sum->nucs[col].num_indv();
+ } else {
+ ++n_pruned_pops;
+
+ const Pop& pop = mpopi.pops()[p];
+ for (uint s = pop.first_sample; s <= pop.last_sample; ++s) {
+ if (d[s] == NULL || col >= (uint) d[s]->len)
+ continue;
+
+ assert(d[s]->model);
+ //
+ // Update the model calls.
+ //
+ d[s]->model[col] = 'U';
+ }
+ }
+ }
+
+ bool sample_prune = false;
+ bool overall_sample_prune = false;
+
+ if (mpopi.pops().size() - n_pruned_pops < (uint) min_populations)
+ sample_prune = true;
+ else if ((double) n_samples_left / mpopi.n_samples() < min_samples_overall)
+ overall_sample_prune = true;
+
+ if (sample_prune || overall_sample_prune) {
+ this->_filtered_sites++;
+
+ for (size_t s = 0; s < mpopi.n_samples(); ++s) {
+ if (d[s] == NULL || col >= (uint) d[s]->len)
+ continue;
+
+ assert(d[s]->model);
+ //
+ // Update the model calls.
+ //
+ d[s]->model[col] = 'U';
+ }
+ }
+ }
+}
+
void
LocusFilter::filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh)
{
@@ -2666,6 +2771,15 @@ SumStatsSummary::accumulate(const vector<LocBin *> &loci)
//
if (t->nucs[pos].priv_allele >= 0)
_private_cnt[t->nucs[pos].priv_allele]++;
+
+ else if (t->nucs[pos].priv_allele == two_priv_alleles) {
+ // If there are two private alleles at this position, find the two populations they occurred in.
+ for (uint pop = 0; pop < this->_pop_cnt; pop++) {
+ s = loci[i]->s->per_pop(pop);
+ if (s->nucs[pos].num_indv() > 0)
+ _private_cnt[pop]++;
+ }
+ }
if (t->nucs[pos].allele_cnt == 2) {
site_cnt++;
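The contig-parsing hunk above walks the raw header value with two character pointers; an equivalent std::string sketch, for illustration only, using the same example value as the source comment:

    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Extract ID and length from a VCF contig value, e.g.
    // "<ID=NW_024121099.1,length=59790976>" -> ("NW_024121099.1", "59790976").
    void parse_contig(const std::string& v, std::string& id, std::string& len)
    {
        size_t eq1   = v.find('=');
        size_t comma = v.find(',', eq1 + 1);
        size_t eq2   = v.find('=', comma + 1);
        size_t close = v.find('>', eq2 + 1);
        if (eq1 == std::string::npos || comma == std::string::npos ||
            eq2 == std::string::npos || close == std::string::npos)
            throw std::runtime_error("Malformed contig definition: " + v);
        id  = v.substr(eq1 + 1, comma - eq1 - 1);
        len = v.substr(eq2 + 1, close - eq2 - 1);
    }

    int main() {
        std::string id, len;
        parse_contig("<ID=NW_024121099.1,length=59790976>", id, len);
        std::cout << id << ' ' << len << '\n'; // NW_024121099.1 59790976
    }

The parsed pairs are handed to MetaPopInfo::add_contig() so that any later VCF export can reproduce the contig definitions.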
=====================================
src/populations.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2012-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2012-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -254,6 +254,7 @@ public:
bool filter(const MetaPopInfo *mpopi, Datum **d, CSLocus *cloc);
void filter_snps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
void filter_haps(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
+ void filter_fixed_sites(LocBin& loc, const MetaPopInfo& mpopi, ostream &log_fh);
void gt_depth_filter(Datum **d, const CSLocus *cloc);
void keep_single_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const;
void keep_random_snp(CSLocus* cloc, Datum** d, size_t n_samples, const LocTally* t) const;
=====================================
src/process_radtags.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2022, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -319,7 +319,7 @@ initialize_counters(map<string, long> &counter)
counter["retained"] = 0;
counter["recovered"] = 0;
counter["transposed"] = 0;
-
+ counter["proppair"] = 0;
return 0;
}
@@ -339,6 +339,7 @@ consolidate_counters(map<string, long> &counter,
counter["retained"] += cntr["retained"];
counter["recovered"] += cntr["recovered"];
counter["transposed"] += cntr["transposed"];
+ counter["proppair"] += cntr["proppair"];
}
return 0;
@@ -358,11 +359,13 @@ consolidate_barcode_logs(map<BarcodePair, map<string, long>> &barcode_log,
barcode_log[bc]["total"] = 0;
barcode_log[bc]["low_qual"] = 0;
barcode_log[bc]["retained"] = 0;
+ barcode_log[bc]["proppair"] = 0;
}
barcode_log[bc]["noradtag"] += it->second["noradtag"];
barcode_log[bc]["total"] += it->second["total"];
barcode_log[bc]["low_qual"] += it->second["low_qual"];
barcode_log[bc]["retained"] += it->second["retained"];
+ barcode_log[bc]["proppair"] += it->second["proppair"];
}
}
@@ -534,6 +537,9 @@ process_paired_reads(string prefix_1,
int result_2 = 1;
if (r_1->retain && r_2->retain) {
+ counter["proppair"]++;
+ barcode_log[bc]["proppair"]++;
+
if (retain_header) {
result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
write_fastq(pair_1_fhs[bc], s_1, r_1) :
@@ -1021,6 +1027,11 @@ process_paired_reads_parallel(string prefix_1,
if (r_2->retain)
process_singlet(r_2, renz_2, true, blog[*bc], cntr);
+ if (r_1->retain && r_2->retain) {
+ cntr["proppair"]++;
+ blog[*bc]["proppair"]++;
+ }
+
//
// Store the result in the output buffer for writing.
//
@@ -1911,6 +1922,7 @@ print_results(int argc, char **argv,
c["ambiguous"] = 0;
c["noradtag"] = 0;
c["transposed"] = 0;
+ c["proppair"] = 0;
//
// Total up the individual counters
@@ -1924,6 +1936,7 @@ print_results(int argc, char **argv,
c["noradtag"] += it->second["noradtag"];
c["retained"] += it->second["retained"];
c["transposed"] += it->second["transposed"];
+ c["proppair"] += it->second["proppair"];
}
auto print_nreads = [&c] (long n, const string& legend) {
@@ -1970,8 +1983,10 @@ print_results(int argc, char **argv,
log << "Barcode Not Found\t" << c["ambiguous"] << "\t" << as_percentage((double) c["ambiguous"] / c["total"]) << "\n"
<< "Low Quality\t" << c["low_quality"] << "\t" << as_percentage((double) c["low_quality"] / c["total"]) << "\n"
<< "RAD Cutsite Not Found\t" << c["noradtag"] << "\t" << as_percentage((double) c["noradtag"] / c["total"]) << "\n"
- << "Retained Reads\t" << c["retained"] << "\t" << as_percentage((double) c["retained"] / c["total"]) << "\n"
- << "END total_raw_read_counts" << "\n";
+ << "Retained Reads\t" << c["retained"] << "\t" << as_percentage((double) c["retained"] / c["total"]) << "\n";
+ if (paired)
+ log << "Properly Paired\t" << c["proppair"] << "\t" << as_percentage(((double) c["proppair"]*2.0) / c["total"]) << "\n";
+ log << "END total_raw_read_counts" << "\n";
if (max_bc_size_1 == 0) return 0;
@@ -1997,8 +2012,11 @@ print_results(int argc, char **argv,
<< "RAD Cutsite Not Found\t"
<< "Low Quality\t"
<< "Retained Reads\t"
- << "Pct Retained\t"
- << "Pct Total Reads\n";
+ << "Pct Retained\t";
+ if (paired)
+ log << "Properly Paired\t"
+ << "Pct Properly Paired\t";
+ log << "Pct Total Reads\n";
set<BarcodePair> barcode_list;
@@ -2011,13 +2029,17 @@ print_results(int argc, char **argv,
if (barcode_log.count(barcodes[i]) == 0)
log << "0\t" << "0\t" << "0\t" << "0\n";
- else
+ else {
log << barcode_log[barcodes[i]]["total"] << "\t"
<< barcode_log[barcodes[i]]["noradtag"] << "\t"
<< barcode_log[barcodes[i]]["low_qual"] << "\t"
<< barcode_log[barcodes[i]]["retained"] << "\t"
- << as_percentage((double) barcode_log[barcodes[i]]["retained"] / barcode_log[barcodes[i]]["total"]) << "\t"
- << as_percentage((double) barcode_log[barcodes[i]]["retained"] / c["total"]) << "\n";
+ << as_percentage((double) barcode_log[barcodes[i]]["retained"] / barcode_log[barcodes[i]]["total"]) << "\t";
+ if (paired)
+ log << barcode_log[barcodes[i]]["proppair"] << "\t"
+ << as_percentage(((double) barcode_log[barcodes[i]]["proppair"]*2.0) / barcode_log[barcodes[i]]["total"]) << "\t";
+ log << as_percentage((double) barcode_log[barcodes[i]]["retained"] / c["total"]) << "\n";
+ }
}
log << "END per_barcode_raw_read_counts" << "\n"
=====================================
src/renz.cc
=====================================
@@ -8,6 +8,8 @@ using namespace std;
//
const char *aciI[] = {"CGC", "CGG", // C/CGC, AciI
"GCG", "CCG"};
+const char *aclI[] = {"CGTT", // AA/CGTT, AclI
+ "GCAA"};
const char *ageI[] = {"CCGGT", // A/CCGGT, AgeI
"ACCGG"};
const char *aluI[] = {"CT", // AG/CT, AluI
@@ -56,6 +58,8 @@ const char *ecoRV[] = {"ATC", // GAT/ATC, EcoRV
"GAT"};
const char *ecoT22I[] = {"TGCAT", // A/TGCAT, EcoT22I
"ATGCA"};
+const char *haeII[] = {"GCGCA", "GCGCG", // RGCGC/Y, HaeII
+ "TGCGC", "CGCGC"};
const char *haeIII[] = {"CC", // GG/CC, HaeIII
"GG"};
const char *hhaI[] = {"GCG", // GCG/C, HhaI
@@ -150,6 +154,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz["sgrAI"] = sgrAI; // CR/CCGGYG, SgrAI; R=A or G; Y=C or T
renz["apeKI"] = apeKI; // G/CWGC, ApeKI; W=A or T
renz["hindIII"] = hindIII; // A/AGCTT, HindIII
+ renz["haeII"] = haeII; // RGCGC/Y, HaeII
renz["haeIII"] = haeIII; // GG/CC, HaeIII
renz["dpnII"] = dpnII; // GATC, DpnII
renz["sphI"] = sphI; // GCATG/C, SphI
@@ -183,6 +188,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz["ageI"] = ageI; // A/CCGGT, AgeI
renz["rsaI"] = rsaI; // GT/AC, RsaI
renz["aciI"] = aciI; // C/CGC, AciI
+ renz["aclI"] = aclI; // AA/CGTT, AclI
renz["bfaI"] = bfaI; // C/TAG, BfaI
renz["bfuCI"] = bfuCI; // /GATC, BfuCI
renz["aseI"] = aseI; // AT/TAAT, AseI
@@ -210,6 +216,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_cnt["sgrAI"] = 2;
renz_cnt["apeKI"] = 2;
renz_cnt["hindIII"] = 1;
+ renz_cnt["haeII"] = 2;
renz_cnt["haeIII"] = 1;
renz_cnt["dpnII"] = 1;
renz_cnt["sphI"] = 1;
@@ -243,6 +250,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_cnt["ageI"] = 1;
renz_cnt["rsaI"] = 1;
renz_cnt["aciI"] = 2;
+ renz_cnt["aclI"] = 1;
renz_cnt["bfaI"] = 1;
renz_cnt["bfuCI"] = 1;
renz_cnt["aseI"] = 1;
@@ -260,7 +268,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_cnt["pspXI"] = 3;
renz_cnt["hpyCH4IV"] = 1;
renz_cnt["ngoMIV"] = 1;
-
+
renz_len["sbfI"] = 6;
renz_len["pstI"] = 5;
renz_len["pstIshi"] = 4;
@@ -271,6 +279,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_len["apeKI"] = 4;
renz_len["hindIII"] = 5;
renz_len["haeIII"] = 2;
+ renz_len["haeII"] = 5;
renz_len["dpnII"] = 4;
renz_len["sphI"] = 5;
renz_len["nlaIII"] = 4;
@@ -303,6 +312,7 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
renz_len["ageI"] = 5;
renz_len["rsaI"] = 2;
renz_len["aciI"] = 3;
+ renz_len["aclI"] = 4;
renz_len["bfaI"] = 3;
renz_len["bfuCI"] = 4;
renz_len["aseI"] = 4;
=====================================
src/sql_utilities.cc
=====================================
@@ -2,7 +2,9 @@
using namespace std;
-void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose) {
+void
+load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose)
+{
CatMatch *m;
string f;
vector<string> parts;
@@ -54,8 +56,8 @@ void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verb
throw exception();
}
- char c = parts[3].at(0);
- if (parts[3] != "consensus" && c != 'A' && c != 'C' && c != 'G' && c != 'T') {
+ char c = parts[2].at(0);
+ if (parts[2] != "consensus" && c != 'A' && c != 'C' && c != 'G' && c != 'T') {
// This sample locus was blacklisted, because:
// "multi": it matches multiple c-loci
// "extra_snp": it has a SNP unknown to the catalog
@@ -66,13 +68,13 @@ void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verb
m = new CatMatch;
m->cat_id = atoi(parts[0].c_str());
- m->sample_id = atoi(parts[1].c_str());
- m->tag_id = atoi(parts[2].c_str());
- m->haplotype = new char[parts[3].length() + 1];
- strcpy(m->haplotype, parts[3].c_str());
- m->depth = atoi(parts[4].c_str());
- m->cigar = new char[parts[5].length() + 1];
- strcpy(m->cigar, parts[5].c_str());
+ ////m->sample_id = atoi(parts[1].c_str());
+ m->tag_id = atoi(parts[1].c_str());
+ m->haplotype = new char[parts[2].length() + 1];
+ strcpy(m->haplotype, parts[2].c_str());
+ m->depth = atoi(parts[3].c_str());
+ m->cigar = new char[parts[4].length() + 1];
+ strcpy(m->cigar, parts[4].c_str());
matches.push_back(m);
}
@@ -86,7 +88,86 @@ void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verb
return;
}
-int load_model_results(string sample, map<int, ModRes *> &modres) {
+int
+load_catalog_samples(string path, map<string, size_t> &seen_samples, size_t &max_sample_id)
+{
+ string f;
+ vector<string> parts;
+ long int line_num;
+ ifstream fh;
+ gzFile gz_fh;
+
+ char *line = (char *) malloc(sizeof(char) * max_len);
+ int size = max_len;
+ int cnt = 0;
+ bool gzip = false;
+ int fh_status = 1;
+
+ f = path + ".sample_list.tsv";
+ fh.open(f.c_str(), ifstream::in);
+ if (fh.fail()) {
+ //
+ // Test for a gzipped file.
+ //
+ f = path + ".sample_list.tsv.gz";
+ gz_fh = gzopen(f.c_str(), "rb");
+ if (!gz_fh)
+ return -1;
+
+ #if ZLIB_VERNUM >= 0x1240
+ gzbuffer(gz_fh, libz_buffer_size);
+ #endif
+ gzip = true;
+ }
+
+ max_sample_id = 0;
+ line_num = 1;
+
+ while (fh_status) {
+ fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
+ line_num++;
+
+ if (!fh_status && strlen(line) == 0)
+ continue;
+
+ if (is_comment(line)) continue;
+
+ parse_tsv(line, parts);
+
+ cnt = parts.size();
+
+ if (cnt != num_samlist_fields) {
+ cerr << "Error parsing '" << f.c_str() << "' at line: " << line_num << ". (" << parts.size() << " fields).\n";
+ throw exception();
+ }
+
+ int sample_id = is_integer(parts[1].c_str());
+ if (sample_id > max_sample_id)
+ max_sample_id = sample_id;
+
+ string sample_name = parts[0];
+ if (seen_samples.count(sample_name) > 0) {
+ // Duplicate sample name.
+ cerr << "Error: Sample '" << sample_name << "' appears more than once in this catalog. Samples must be unique." << endl;
+ throw exception();
+ } else {
+ seen_samples[sample_name] = sample_id;
+ }
+ }
+
+ if (gzip)
+ gzclose(gz_fh);
+ else
+ fh.close();
+ free(line);
+
+ return 1;
+}
+
+int
+load_model_results(string sample, map<int, ModRes *> &modres)
+{
+ //// Deprecated
string f;
vector<string> parts;
long int line_num;
@@ -207,6 +288,7 @@ int load_model_results(string sample, map<int, ModRes *> &modres) {
}
int load_snp_calls(string sample, map<int, SNPRes *> &snpres) {
+ //// Deprecated
string f;
int id, samp_id;
vector<string> parts;
=====================================
src/sql_utilities.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2016, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -32,14 +32,20 @@
//
// The expected number of tab-separated fields in our SQL input files.
//
-const uint num_tags_fields = 9;
-const uint num_snps_fields = 9;
-const uint num_alleles_fields = 5;
-const uint num_matches_fields = 6;
-
-void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose=true);
-int load_model_results(string sample, map<int, ModRes *> &modres);
-int load_snp_calls(string sample, map<int, SNPRes *> &snpres);
+////const uint num_tags_fields = 9;
+////const uint num_snps_fields = 9;
+////const uint num_alleles_fields = 5;
+////const uint num_matches_fields = 6;
+const uint num_tags_fields = 8;
+const uint num_snps_fields = 8;
+const uint num_alleles_fields = 4;
+const uint num_matches_fields = 5;
+const uint num_samlist_fields = 2;
+
+void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose=true);
+int load_catalog_samples(string path, map<string, size_t> &seen_samples, size_t &max_sample_id);
+int load_model_results(string sample, map<int, ModRes *> &modres);
+int load_snp_calls(string sample, map<int, SNPRes *> &snpres);
// retrieve_bijective_sloci()
// ----------
@@ -126,9 +132,9 @@ load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool
return 0;
}
- id = atoi(parts[1].c_str());
+ id = atoi(parts[0].c_str());
- if (parts[2] != "consensus") {
+ if (parts[1] != "consensus") {
if (blacklisted.count(id)) continue;
//
@@ -140,9 +146,9 @@ load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool
// Read the model sequence, a series of letters specifying if the model called a
// homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
//
- if (parts[2] == "model") {
- loci[id]->model = new char[parts[5].length() + 1];
- strcpy(loci[id]->model, parts[5].c_str());
+ if (parts[1] == "model") {
+ loci[id]->model = new char[parts[4].length() + 1];
+ strcpy(loci[id]->model, parts[4].c_str());
} else {
//
@@ -153,23 +159,23 @@ load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool
if (store_reads >= 1) {
if (store_reads >= 2) {
// Load the actual sequences (otherwise don't).
- char *read = new char[parts[5].length() + 1];
- strcpy(read, parts[5].c_str());
+ char *read = new char[parts[4].length() + 1];
+ strcpy(read, parts[4].c_str());
loci[id]->reads.push_back(read);
}
- char *read_id = new char[parts[4].length() + 1];
- strcpy(read_id, parts[4].c_str());
+ char *read_id = new char[parts[3].length() + 1];
+ strcpy(read_id, parts[3].c_str());
loci[id]->comp.push_back(read_id);
//
// Store the internal stack number for this read.
//
- loci[id]->comp_cnt.push_back(atoi(parts[3].c_str()));
+ loci[id]->comp_cnt.push_back(atoi(parts[2].c_str()));
//
// Store the read type.
//
- if (parts[2] == "primary")
+ if (parts[1] == "primary")
loci[id]->comp_type.push_back(primary);
else
loci[id]->comp_type.push_back(secondary);
@@ -187,26 +193,26 @@ load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool
// Do not include blacklisted tags in the catalog. They are tags that are composed
// of noise and/or repetitive sequence.
//
- if (parts[7] == "1") {
+ if (parts[6] == "1") {
blacklisted.insert(id);
continue;
}
c = new LocusT;
- c->sample_id = atoi(parts[0].c_str());
+ ////c->sample_id = atoi(parts[0].c_str());
c->id = id;
- c->add_consensus(parts[5].c_str());
+ c->add_consensus(parts[4].c_str());
//
// Read in the flags
//
- c->deleveraged = (parts[6] == "1" ? true : false);
- c->lumberjackstack = (parts[8] == "1" ? true : false);
+ c->deleveraged = (parts[5] == "1" ? true : false);
+ c->lumberjackstack = (parts[7] == "1" ? true : false);
//
// Parse the components of this stack (either the Illumina ID, or the catalog constituents)
//
- q = parts[4].c_str();
+ q = parts[3].c_str();
while (*q != '\0') {
for (p = q; *q != ',' && *q != '\0'; q++);
len = q - p;
@@ -270,7 +276,7 @@ load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool
return 0;
}
- id = atoi(parts[1].c_str());
+ id = atoi(parts[0].c_str());
if (blacklisted.count(id))
continue;
@@ -278,31 +284,31 @@ load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool
//
// Only load heterozygous model calls.
//
- if (load_all_model_calls == false && parts[3] != "E")
+ if (load_all_model_calls == false && parts[2] != "E")
continue;
snp = new SNP;
- snp->col = atoi(parts[2].c_str());
- snp->lratio = atof(parts[4].c_str());
- snp->rank_1 = parts[5].at(0);
- snp->rank_2 = parts[6].at(0) == '-' ? 0 : parts[6].at(0);
+ snp->col = atoi(parts[1].c_str());
+ snp->lratio = atof(parts[3].c_str());
+ snp->rank_1 = parts[4].at(0);
+ snp->rank_2 = parts[5].at(0) == '-' ? 0 : parts[5].at(0);
- if (parts[3] == "E")
+ if (parts[2] == "E")
snp->type = snp_type_het;
- else if (parts[3] == "O")
+ else if (parts[2] == "O")
snp->type = snp_type_hom;
else
snp->type = snp_type_unk;
- if (parts[7].length() == 0 || parts[7].at(0) == '-')
+ if (parts[6].length() == 0 || parts[6].at(0) == '-')
snp->rank_3 = 0;
else
- snp->rank_3 = parts[7].at(0);
+ snp->rank_3 = parts[6].at(0);
- if (parts[8].length() == 0 || parts[8].at(0) == '-')
+ if (parts[7].length() == 0 || parts[7].at(0) == '-')
snp->rank_4 = 0;
else
- snp->rank_4 = parts[8].at(0);
+ snp->rank_4 = parts[7].at(0);
if (loci.count(id) > 0) {
loci[id]->snps.push_back(snp);
@@ -362,13 +368,13 @@ load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool
return 0;
}
- id = atoi(parts[1].c_str());
+ id = atoi(parts[0].c_str());
if (blacklisted.count(id))
continue;
if (loci.count(id) > 0) {
- loci[id]->alleles[parts[2]] = atoi(parts[4].c_str());
+ loci[id]->alleles[parts[1]] = atoi(parts[3].c_str());
} else {
cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". SNP asks for nonexistent locus with ID: " << id << "\n";
return 0;
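All of the index changes above follow from dropping the leading sample-ID column from the ustacks/cstacks TSV files: every field count decreases by one and every parts[] index shifts left. For a snps file row, for example (values are hypothetical):

    # before (9 fields): sample_id  locus_id  col  type  lratio  rank_1  rank_2  rank_3  rank_4
    1	57	14	E	0.00	A	G	-	-
    # after  (8 fields):            locus_id  col  type  lratio  rank_1  rank_2  rank_3  rank_4
    57	14	E	0.00	A	G	-	-

The new num_samlist_fields = 2 covers the catalog.sample_list.tsv rows ("<file name> TAB <sample ID>"), which now carry the sample identities instead.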
=====================================
src/sstacks.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -1261,7 +1261,6 @@ write_matches(string sample_path, map<int, QLocus *> &sample)
qloc->alleles[qloc->matches[j]->cat_type] : qloc->depth;
sstr << qloc->matches[j]->cat_id << "\t"
- << samp_id << "\t"
<< qloc->id << "\t"
<< qloc->matches[j]->cat_type << "\t"
<< match_depth << "\t"
=====================================
src/tsv2bam.cc
=====================================
@@ -1,3 +1,22 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2016-2023, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+//
#include <cmath>
#include <cstring>
#include <iostream>
@@ -21,7 +40,8 @@
void parse_command_line(int argc, char* argv[]);
void report_options(ostream& os);
int run();
-void run(const vector<int>& cloc_ids,
+void run(const map<string, size_t> &sample_ids,
+ const vector<int>& cloc_ids,
const string& header_sq_lines,
const unordered_map<int,size_t>& cloc_lengths,
size_t sample_i, ostream& os
@@ -31,13 +51,14 @@ void cigar_apply_to_locus(Locus* l, const Cigar& c);
//
// Argument globals.
//
-bool quiet = false;
-string in_dir;
+string in_dir;
vector<string> samples;
-string popmap_path;
+string popmap_path;
vector<string> pe_reads_paths;
-FileT pe_reads_format;
-int num_threads = 1;
+FileT pe_reads_format;
+
+bool quiet = false;
+int num_threads = 1;
bool dbg_reversed_pe_reads = false;
bool dbg_only_bijective_loci = false;
@@ -50,7 +71,9 @@ unique_ptr<LogAlterator> logger = NULL;
// main()
// ==========
-int main(int argc, char* argv[]) {
+int
+main(int argc, char* argv[])
+{
IF_NDEBUG_TRY
// Parse arguments
@@ -80,26 +103,46 @@ int main(int argc, char* argv[]) {
IF_NDEBUG_CATCH_ALL_EXCEPTIONS
}
-int run() {
-
+int
+run()
+{
//
// Read the catalog.
//
cout << "Loading the catalog..." << endl;
- map<int, Locus*> catalog;
+ map<int, Locus*> catalog;
+ map<string, size_t> sample_ids;
+
string catalog_prefix = in_dir + "catalog";
- bool dummy;
- int rv = load_loci(catalog_prefix, catalog, 0, false, dummy, false);
- if (rv != 1) {
+ bool compressed;
+ size_t max_sample_id = 0;
+ size_t next_sample_id = 1;
+
+ if (!load_loci(catalog_prefix, catalog, 0, false, compressed, false)) {
cerr << "Error: Unable to load catalog '" << catalog_prefix << "'.\n";
throw exception();
}
-
+ //
+ // Assign arbitrary sample IDs to the incoming samples, respecting the IDs already assigned by cstacks.
+ // Since not all samples have to be part of the catalog, we may have additional samples now being
+ // processed that are not in the catalog and do not have an existing ID.
+ //
+ if (!load_catalog_samples(catalog_prefix, sample_ids, max_sample_id)) {
+ cerr << "Error: Unable to load catalog sample list '" << catalog_prefix << "'.\n";
+ throw exception();
+ }
+ next_sample_id = max_sample_id + 1;
+ for (size_t i = 0; i < samples.size(); ++i)
+ if (sample_ids.count(samples[i]) == 0) {
+ sample_ids[samples[i]] = next_sample_id;
+ next_sample_id++;
+ }
+
//
// Create the BAM target list.
//
stringstream header_sq_lines_ss;
- vector<int> cloc_ids;
+ vector<int> cloc_ids;
unordered_map<int,size_t> cloc_lengths;
cloc_ids.reserve(catalog.size());
for (auto& cloc : catalog) {
@@ -125,7 +168,7 @@ int run() {
#pragma omp parallel
{
#pragma omp for schedule(dynamic)
- for (size_t i=0; i<samples.size(); ++i) {
+ for (size_t i = 0; i < samples.size(); ++i) {
if (omp_return != 0)
continue;
try {
@@ -133,7 +176,7 @@ int run() {
cout << "Processing sample '" << samples[i] << "'...\n" << flush;
stringstream ss;
- run(cloc_ids, header_sq_lines, cloc_lengths, i, ss);
+ run(sample_ids, cloc_ids, header_sq_lines, cloc_lengths, i, ss);
outputs[i] = ss.str();
} catch (exception& e) {
@@ -155,12 +198,14 @@ int run() {
return 0;
}
-void run(const vector<int>& cloc_ids,
+void
+run(const map<string, size_t> &sample_ids,
+ const vector<int>& cloc_ids,
const string& header_sq_lines,
const unordered_map<int,size_t>& cloc_lengths,
size_t sample_i,
- ostream& os
-) {
+ ostream& os)
+{
ostream& cout = os;
cout << "Sample '" << samples.at(sample_i) << "':";
@@ -171,7 +216,7 @@ void run(const vector<int>& cloc_ids,
//
map<int, Locus*> sloci;
vector<CatMatch*> matches;
- int sample_id = -1;
+ int sample_id = sample_ids.at(samples.at(sample_i));
{
load_catalog_matches(prefix_path, matches, false);
if (matches.empty()) {
@@ -179,7 +224,7 @@ void run(const vector<int>& cloc_ids,
<< prefix_path + ".matches.tsv(.gz)'.\n";
throw exception();
}
- sample_id = matches[0]->sample_id;
+ matches[0]->sample_id = sample_id;
bool dummy;
int rv = load_loci(prefix_path, sloci, 2, false, dummy, false);
=====================================
src/ustacks.cc
=====================================
@@ -71,7 +71,6 @@ try{
cerr << "ustacks parameters selected:\n"
<< " Input file: '" << in_file << "'\n"
- << " Sample ID: " << sample_id << "\n"
<< " Min depth of coverage to create a stack (m): " << min_merge_cov << "\n"
<< " Repeat removal algorithm: " << (remove_rep_stacks ? "enabled" : "disabled") << "\n"
<< " Max distance allowed between stacks (M): " << max_utag_dist << "\n"
@@ -185,7 +184,7 @@ try{
//
// Assemble loci (merge primary stacks).
//
- cerr << "Assembling stacks (max. dist. M=" << max_utag_dist << ")...\n";
+ cerr << "Assembling stacks (max dist. M=" << max_utag_dist << ")...\n";
calc_kmer_distance(merged, max_utag_dist);
size_t n_blacklisted;
merge_stacks(merged, n_blacklisted);
@@ -200,7 +199,7 @@ try{
//
// Merge secondary stacks.
//
- cerr << "Merging secondary stacks (max. dist. N=" << max_rem_dist << " from consensus)...\n";
+ cerr << "Merging secondary stacks (max dist. N=" << max_rem_dist << " from consensus)...\n";
size_t utilized = merge_remainders(merged, unique, remainders);
call_consensus(merged, unique, remainders, false);
size_t util_gapped = merge_gapped_remainders(merged, unique, remainders);
@@ -2378,8 +2377,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
tag_1->calc_likelihood();
// First write the consensus sequence
- tags << sample_id << "\t"
- << tag_1->id << "\t"
+ tags << tag_1->id << "\t"
<< "consensus\t" << "\t"
<< "\t"
<< tag_1->con << "\t"
@@ -2390,8 +2388,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
//
// Write a sequence recording the output of the SNP model for each nucleotide.
//
- tags << sample_id << "\t"
- << tag_1->id << "\t"
+ tags << tag_1->id << "\t"
<< "model\t" << "\t"
<< "\t";
for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
@@ -2418,8 +2415,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
total += tag_2->count();
for (uint j = 0; j < tag_2->map.size(); j++) {
- tags << sample_id << "\t"
- << tag_1->id << "\t"
+ tags << tag_1->id << "\t"
<< "primary\t"
<< id << "\t"
<< seq_ids[tag_2->map[j]] << "\t"
@@ -2436,8 +2432,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
rem = r[*k];
total += rem->map.size();
for (uint j = 0; j < rem->map.size(); j++)
- tags << sample_id << "\t"
- << tag_1->id << "\t"
+ tags << tag_1->id << "\t"
<< "secondary\t"
<< "\t"
<< seq_ids[rem->map[j]] << "\t"
@@ -2449,8 +2444,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
// Write out the model calls for each nucleotide in this locus.
//
for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
- snps << sample_id << "\t"
- << tag_1->id << "\t"
+ snps << tag_1->id << "\t"
<< (size_t) (*s)->col << "\t";
switch((*s)->type) {
case snp_type_het:
@@ -2474,8 +2468,7 @@ write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *>
// the percentage of tags a particular allele occupies.
//
for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
- alle << sample_id << "\t"
- << tag_1->id << "\t"
+ alle << tag_1->id << "\t"
<< (*t).first << "\t";
sprintf(strbuf32, "%.2f", ((*t).second/total) * 100);
alle << strbuf32 << "\t"
@@ -2859,7 +2852,6 @@ int parse_command_line(int argc, char* argv[]) {
sample_name = optarg;
break;
case 'i':
- sample_id = is_integer(optarg);
break;
case 'm':
min_merge_cov = is_integer(optarg);
@@ -3016,11 +3008,6 @@ int parse_command_line(int argc, char* argv[]) {
}
prefix_path = out_path + sample_name;
- if (sample_id < 0) {
- cerr << "A unique sample ID must be provided (a positive integer).\n";
- help();
- }
-
if (model_type == ::fixed && barcode_err_freq == 0) {
cerr << "You must specify the barcode error frequency.\n";
help();
@@ -3037,9 +3024,8 @@ void version() {
void help() {
cerr << "ustacks " << VERSION << "\n"
- << "ustacks -f file_path -i id -o path [-M max_dist] [-m min_cov] [-p num_threads]" << "\n"
+ << "ustacks -f file_path -o path [-M max_dist] [-m min_cov] [-p num_threads]" << "\n"
<< " f: input file path.\n"
- << " i: a unique integer ID for this sample.\n"
<< " o: output path to write results.\n"
<< " M: Maximum distance (in nucleotides) allowed between stacks (default 2).\n"
<< " m: Minimum depth of coverage required to create a stack (default 3).\n"
View it on GitLab: https://salsa.debian.org/med-team/stacks/-/commit/eaf87a65c105869ea307fe13c0b77c60692e1c80