[med-svn] [Git][med-team/stacks][master] 3 commits: New upstream version 2.59+dfsg

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Wed Jul 14 16:31:11 BST 2021



Nilesh Patra pushed to branch master at Debian Med / stacks


Commits:
c3578770 by Nilesh Patra at 2021-07-14T20:41:48+05:30
New upstream version 2.59+dfsg
- - - - -
36c79f47 by Nilesh Patra at 2021-07-14T20:41:50+05:30
Update upstream source from tag 'upstream/2.59+dfsg'

Update to upstream version '2.59+dfsg'
with Debian dir c6c27203ba8c555e7ca4e355c36d28c3a2db71de
- - - - -
5c517ee9 by Nilesh Patra at 2021-07-14T20:42:11+05:30
Interim changelog entry

- - - - -


19 changed files:

- ChangeLog
- configure.ac
- debian/changelog
- scripts/stacks-integrate-alignments
- src/MetaPopInfo.h
- src/Vcf.cc
- src/Vcf.h
- src/debruijn.cc
- src/debruijn.h
- src/export_formats.cc
- src/gstacks.cc
- src/gzFasta.cc
- src/gzFasta.h
- src/locus.cc
- src/locus.h
- src/models.h
- src/populations.cc
- src/populations.h
- src/process_radtags.cc


Changes:

=====================================
ChangeLog
=====================================
@@ -1,9 +1,23 @@
-Stacks 2.58 - June, 08, 2021
+Stacks 2.59 - July 13, 2021
+---------------------------
+    Feature: updated populations to output the number of missing sites and loci, per sample to the
+             populations.log.distribs file.
+    Feature: replaced stacks-integrate-alignments with a new Python program. This new program allows for greater
+             filtering of alignments and more error checking for alignments where an fragment alignment could
+             associate SNPs within loci that had non-existent coordinates (on the reference genome).
+    Feature: updated populations to look for and if found, load the file 'catalog.chrs.tsv' in the Stacks output
+             directory. This is then exported as part of the VCF headers to supply contig names/lengths.
+    Feature: updated the process_radtags log file to have similar headers to the *.distribs files and the ability
+             to extract portions of the log with stacks-dist-extract utility.
+    Bugfix:  In the populations program, the phylip-all export would throw an error (or misprint the sequences)
+             if the population names were of different lengths.
+
+Stacks 2.58 - June 08, 2021
 ----------------------------
-    Bugfix: Fixed several memory errors in ustacks related to processing trimmed reads. 
+    Bugfix: Fixed several memory errors in ustacks related to processing trimmed reads.
 
-Stacks 2.57 - May, 10, 2021
----------------------------
+Stacks 2.57 - May 10, 2021
+--------------------------
     Feature: updated process_radtags so that if you specify the same sample name in the barcodes file for multiple
              barcodes, the program will merge the output for those barcodes into the single, specified output file.
     Feature: changed the default 'smoothed' and 'bootstrap' values in output files to contain a -1.0 if a particular locus


=====================================
configure.ac
=====================================
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.59)
-AC_INIT([Stacks], [2.58])
+AC_INIT([Stacks], [2.59])
 AC_CONFIG_AUX_DIR([config])
 AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
 AC_CONFIG_SRCDIR([src/ustacks.cc])


=====================================
debian/changelog
=====================================
@@ -1,7 +1,7 @@
-stacks (2.58+dfsg-1) UNRELEASED; urgency=medium
+stacks (2.59+dfsg-1) UNRELEASED; urgency=medium
 
   * Team upload.
-  * New upstream version 2.58+dfsg
+  * New upstream version 2.59+dfsg
 
  -- Nilesh Patra <nilesh at debian.org>  Sat, 15 May 2021 17:12:18 +0530
 


=====================================
scripts/stacks-integrate-alignments
=====================================
@@ -1,199 +1,681 @@
-#!/bin/bash
-
-usage="\
-Usage:
-  $(basename "$0") -P stacks_dir -B bam_file [-q min_MAPQ=20] -O out_dir
-
-Extracts the coordinates of the RAD loci from the given BAM file into a
-'locus_coordinates.tsv' table, then rewrites the 'catalog.fa.gz' and
-'catalog.calls' files so that they include the genomic coordinates given in the
-input BAM file.
-"
-
-version="_VERSION_"
-
-# Check dependencies
-# ==========
-
-command -v python3 &>/dev/null  || { python3; echo "Error: Python 3 is required." >&2; exit 1; }
-command -v samtools &>/dev/null || { samtools; echo "Error: Samtools is required." >&2; exit 1; }
-
-# Parse arguments
-# ==========
-
-# If there aren't any arguments, print the help.
-[[ $# -gt 0 ]] || { echo -n "$usage"; exit 1; }
-
-# Check that there aren't weird characters in the input.
-for arg in "$@" ;do
-    if [[ "$arg" =~ [^[:alnum:]_.,/~+=:-] ]] ;then
-        echo "Error: illegal character in argument '$arg'." >&2
-        exit 1
-    fi
-done
-
-# Parse arguments.
-min_mapq=20
-while getopts 'P:B:O:q:hv' opt ;do
-    case "$opt" in
-    P) stacks_d="${OPTARG/%\//}" ;;
-    B) bam_f="${OPTARG/%\//}" ;;
-    O) out_d="${OPTARG/%\//}" ;;
-    q) min_mapq="$OPTARG" ;;
-    h) echo -n "$usage" >&2; exit 1 ;;
-    v) echo "$version" >&2; exit 1 ;;
-    *) echo "Error: Bad arguments (-h for help)." >&2; exit 1 ;;
-    esac
-done
-shift $((OPTIND-1))
-if [[ $# -ne 0 ]] ;then
-    echo "Error: '$1' is seen as a positional argument (expected no positional arguments)." >&2
-    exit 1
-fi
-
-# Check them.
-[[ "$stacks_d" ]] || { echo "Error: -P is required." >&2; exit 1; }
-[[ "$bam_f" ]]    || { echo "Error: -B is required." >&2; exit 1; }
-[[ "$out_d" ]]    || { echo "Error: -O is required." >&2; exit 1; }
-[[ "$min_mapq" =~ ^[0-9]+$ ]] || { echo "Error: -q expects a positive integer (received '$min_mapq')." >&2; exit 1; }
-
-[[ -e "$stacks_d" ]] || { ls -- "$stacks_d"; exit 1; }
-[[ -e "$bam_f" ]]    || { ls -- "$bam_f"; exit 1; }
-
-fa="$stacks_d/catalog.fa.gz"
-vcf="$stacks_d/catalog.calls"
-[[ -e "$fa" ]]  || { ls -- "$fa"; exit 1; }
-[[ -e "$vcf" ]] || { ls -- "$vcf"; exit 1; }
-
-o_coords="$out_d/locus_coordinates.tsv"
-o_fa="$out_d/catalog.fa.gz"
-o_vcf="$out_d/catalog.calls"
-[[ ! -e "$o_coords" ]]  || { echo "Error: Refusing to overwrite '$o_coords'." >&2; exit 1; }
-[[ ! -e "$o_fa" ]]  || { echo "Error: Refusing to overwrite '$o_fa'." >&2; exit 1; }
-[[ ! -e "$o_vcf" ]] || { echo "Error: Refusing to overwrite '$o_vcf'." >&2; exit 1; }
-
-# If something goes wrong, stop & print an error.
-set -e -o pipefail
-trap "echo 'Error: aborted.' >&2" EXIT
-
-# Make sure we have write permissions.
-mkdir -p "$out_d"
-touch "$o_fa" "$o_vcf"
-
-# Retrieve the highest ID in the catalog.
-# ==========
-
-echo "Finding the highest current locus ID..."
-id=$(gzip -cd "$fa" | tail -n2 | head -n1 | awk '{print $1}')
-if [[ ! "$id" =~ ^\>[0-9]+$ ]] ;then
-    echo "Error: Unexpected format in '$fa'; the second to last line was:" >&2
-    gzip -cd "$fa" | tail -n2 | head -n1
-    exit 1
-fi
-id=$(echo "$id" | cut -c2-)
-echo "$id"
-echo
-
-# Write the old_id : new_id : pos file.
-# ==========
-
-echo "Extracting locus coordinates..."
-samtools view -F 0x904 -q "$min_mapq" "$bam_f" |
-    # Convert SAM to `LOCUS_ID \t CHR \t POS \t STRAND`.
-    awk -F '\t' -v OFS='\t' '{
-            if (int( $2 % (2*16) / 16 )) {
-                # Negative strand.
-                pos = $4 - 1
-                gsub(/[0-9]+[ISH]/, "", $6)
-                gsub(/[A-Z]/, ",", $6)
-                n = split(substr($6, 1, length($6)-1), cig, ",")
-                for (i=1; i<=n; ++i) { pos += cig[i] }
-                print $1, $3, pos, "-"
-            } else {
-                print $1, $3, $4, "+"
-            }
-        }' |
-    # Sort by chromosome.
-    sort -k2,2 |
-    # Pull the reference chromosome order from the BAM header.
-    join -t $'\t' -12 -22 -o "1.1 1.2 2.1 1.3 1.4" \
-        - \
-        <(samtools view -H "$bam_f" |
-            grep '^@SQ' | grep -oE $'SN:[^\t]+' | cut -c4- |
-            awk '{printf NR"\t"$1"\n"}' |
-            sort -k2,2
-        ) |
-    # Sort by chromosome index, bp & strand, then remove the index.
-    sort -k3,3n -k4,4n -k5,5r | cut -f 1,2,4,5 |
-    # Create the new IDs (we start at the power of 10 above the current max).
-    awk 'BEGIN{id = '"$id"' + 1} {printf id++ "\t" $1 "\t" $2 ":" $3 ":" $4 "\n"}' |
-    # Add a header.
-    { echo -e 'id_new\tid_old\taln_pos'; cat; } \
-    > "$o_coords"
-
-[[ $(wc -l < "$o_coords") -gt 1 ]] || { echo "Error: Extraction of coordinates failed; check BAM file." >&2; exit 1; }
-echo "Wrote '$o_coords'."
-echo
-
-# Write the new FASTA file.
-# ==========
-
-echo "Rewriting locus sequences and information..."
-gzip -cd "$fa" |
-    tr -d '\r' | paste -d '\r' - - | sed 's/^>//' |
-    awk -F' ' -v OFS=' ' '
-        BEGIN{
-            while(getline < "'"$o_coords"'") {
-                if (! /^id_new/) {
-                    old2new[$2] = $1
-                    old2pos[$2] = $3
-                }
-            }
-        } {
-            old_id = $1
-            if (old_id in old2new) {
-                $1 = old2new[old_id] " pos=" old2pos[old_id]
-                print
-            }
-        }' |
-    sort -k1,1n |
-    sed 's/^/>/' | tr '\r' '\n' |
-    gzip \
-    > "$o_fa"
-
-echo "Wrote '$o_fa'."
-echo
-
-# Write the new VCF file.
-# ==========
-
-echo "Rewriting variants..."
-{
-# Copy the header.
-{ gzip -cd "$vcf" 2>/dev/null || true; } | sed '/^#CHROM/ q' # (gzip unhappy about the closed pipe).
-
-# Update & resort the records.
-gzip -cd "$vcf" | grep -v '^#' |
-    awk -F'\t' -v OFS='\t' '
-        BEGIN{
-            while(getline < "'"$o_coords"'") {
-                if (! /^id_new/) {
-                    old2new[$2] = $1
-                }
-            }
-        } {
-            old_id = $1
-            if (old_id in old2new) {
-                $1 = old2new[old_id]
-                print
-            }
-       }' |
-    sort -k1,1n -k2,2n
-} | gzip \
-> "$o_vcf"
-
-echo "Wrote '$o_vcf'."
-echo
-
-trap - EXIT
-echo "$(basename "$0") is done."
+#!/usr/bin/env python3
+#
+# Copyright 2021, Julian Catchen <jcatchen at illinois.edu>
+#
+# This file is part of Stacks.
+#
+# Stacks is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Stacks is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import sys
+import argparse
+import os
+import gzip
+from datetime import datetime
+from enum import Enum
+from operator import itemgetter, attrgetter
+
+version     = '_VERSION_'
+stacks_path = ""
+bam_path    = ""
+out_path    = ""
+
+min_pct_id  = 0.6  # Minimum BLAST-style percent identity to keep an alignment.
+min_aln_cov = 0.6  # Minimum fraction of the read participating in the alignment.
+min_mapq    = 20   # Minimum mapping quality required to keep an alignment.
+verbose     = False
+
+def parse_cigar(c_str):
+    cigar = []
+
+    cigar_len = len(c_str)
+    i         = 0
+    while i < cigar_len:
+        j = i
+        while c_str[j].isdigit():
+            j += 1
+        cigar.append( (c_str[j], int(c_str[i:j])) )
+        i = j + 1
+
+    return cigar
+
+
+class AlnType(Enum):
+    unmapped      = 4
+    primary       = 2
+    secondary     = 256
+    supplementary = 2048
+
+
+class Aln:
+    """
+    Object to hold the alignment of one read to the reference.
+    """
+    def __init__(self, chr, bp, strand, alntype, mapq, cigar, edit_dist):
+        self.chr       = chr
+        self.bp        = bp
+        self.type      = alntype
+        self.strand    = strand  # Strand alignment is mapped to; 1 = positive, 0 = negative.
+        self.mapq      = mapq    # Mapping quality score.
+        self.edit_dist = 0       # Edit distance of the query to the reference.
+        self.q_len     = 0       # Length of the query read.
+        self.pct_id    = 0.0     # BLAST-style percent identity between the query and reference.
+        self.aln_pct   = 0.0     # Percentage of the query length participating in the alignment.
+        self.cigar_str = cigar
+        self.cigar     = parse_cigar(cigar)
+
+        #
+        # Calculate alignment block length from the CIGAR string.
+        #
+        query_len     = 0.0
+        aln_blk_len   = 0.0
+        query_aln_len = 0.0
+        ref_aln_len   = 0
+        for cig_op, op_len in self.cigar:
+            if cig_op == "H" or cig_op == "S":
+                query_len     += op_len
+            elif cig_op == "M":
+                query_len     += op_len
+                aln_blk_len   += op_len
+                query_aln_len += op_len
+                ref_aln_len   += op_len
+            elif cig_op == "I":
+                query_len     += op_len
+                aln_blk_len   += op_len
+                query_aln_len += op_len
+            elif cig_op == "D":
+                aln_blk_len   += op_len
+                ref_aln_len   += op_len
+
+        self.q_len   = query_len
+        self.r_len   = ref_aln_len
+        self.pct_id  = (aln_blk_len - float(edit_dist)) / aln_blk_len
+        self.aln_pct = query_aln_len / query_len
+
+class Locus:
+    def __init__(self, id):
+        self.id_old   = id
+        self.id_new   = -1
+        self.pri_aln  = None
+        self.num_samples = 0
+        self.contig      = ""
+        self.sequence    = ""
+
+def parse_command_line():
+    global min_pct_id, min_aln_cov, min_mapq
+    global stacks_path, bam_path, out_path
+    global verbose, version
+
+    desc = """Extracts the coordinates of the RAD loci from the given BAM file into a \
+    'locus_coordinates.tsv' table, then rewrites the 'catalog.fa.gz' and \
+    'catalog.calls' files so that they include the genomic coordinates given in the \
+    input BAM file."""
+
+    p = argparse.ArgumentParser(description=desc)
+
+    #
+    # Add options.
+    #
+    p.add_argument("-P","--in-path", type=str, dest="stacks_path", required=True,
+                   help="Path to a directory containing Stacks ouput files.")
+    p.add_argument("-B","--bam-path", type=str, dest="bam_path", required=True,
+                   help="Path to a SAM or BAM file containing alignment of de novo catalog loci to a reference genome.")
+    p.add_argument("-O","--out-path", type=str, dest="out_path", required=True,
+                   help="Path to write the integrated ouput files.")
+    p.add_argument("-q", "--min_mapq", type=int,
+                   help="Minimum mapping quality as listed in the BAM file (default 20).")
+    p.add_argument("-a", "--min_alncov", type=float,
+                   help="Minimum fraction of the de novo catalog locus that must participate in the alignment (default 0.6).")
+    p.add_argument("-p", "--min_pctid", type=float,
+                   help="Minimum BLAST-style percent identity of the largest alignment fragment for a de novo catalog locus (default 0.6).")
+    p.add_argument("--verbose", action="store_true", dest="verbose",
+                   help="Provide verbose output.")
+    p.add_argument("--version", action='version', version=version)
+
+    #
+    # Parse the command line
+    #
+    args = p.parse_args()
+
+    if args.min_pctid != None:
+        min_pct_id = args.min_pctid
+        if min_pct_id > 1 and min_pct_id <= 100:
+            min_pct_id = min_pct_id / 100
+    if args.min_alncov != None:
+        min_aln_cov = args.min_alncov
+        if min_aln_cov > 1 and min_aln_cov <= 100:
+            min_aln_cov = min_aln_cov / 100
+    if args.min_mapq != None:
+        min_mapq = args.min_mapq
+    if args.stacks_path != None:
+        stacks_path = args.stacks_path
+    if args.bam_path != None:
+        bam_path = args.bam_path
+    if args.out_path != None:
+        out_path = args.out_path
+    if args.verbose != None:
+        verbose = args.verbose
+
+    if len(stacks_path) == 0 or os.path.exists(stacks_path) == False:
+        print("Error: you must specify a valid path to the de novo Stacks output directory.", file=sys.stderr)
+        p.print_help()
+        sys.exit()
+
+    if len(bam_path) == 0 or os.path.exists(bam_path) == False:
+        print("Error: you must specify a valid path to SAM or BAM file containing alignmnets of de novo catalog loci to a reference genome.", file=sys.stderr)
+        p.print_help()
+        sys.exit()
+
+    if len(out_path) == 0 or os.path.exists(out_path) == False:
+        print("Error: you must specify a valid/existing path to a directory to write the integrated output files.", file=sys.stderr)
+        p.print_help()
+        sys.exit()
+
+    if out_path[-1] != "/":
+        out_path += "/"
+
+    if stacks_path[-1] != "/":
+        stacks_path += "/"
+
+
+def find_max_locus_id(stacks_path):
+
+    fh = gzip.open(stacks_path + "catalog.fa.gz", 'rt')
+
+    max_locus_id = 0
+    
+    for line in fh:
+        line = line.strip('\n')
+
+        if line[0] != ">":
+            continue
+
+        parts    = line.split(' ')
+        locus_id = int(parts[0][1:])
+
+        if locus_id > max_locus_id:
+            max_locus_id = locus_id
+    fh.close()
+
+    return max_locus_id
+
+
+def parse_bam_file(path, loci, chrs):
+    #
+    # Pull a list of chromosomes out of the BAM header.
+    #
+    cmd = "samtools view -H " + path
+    fh = os.popen(cmd, "r")
+
+    for line in fh:
+        line  = line.strip("\n")
+        parts = line.split("\t")
+
+        if parts[0] != "@SQ":
+            continue
+
+        chrs.append( (parts[1][3:], int(parts[2][3:])) )
+    fh.close()
+
+    #
+    # Sort chromosomes by size.
+    #
+    chrs.sort(key=itemgetter(1), reverse=True)
+
+    tot_loc = {}
+    tot_aln = 0
+    pri_cnt = 0
+    sec_cnt = 0
+    sup_cnt = 0
+    unm_cnt = 0
+    
+    cmd = "samtools view " + path
+    fh = os.popen(cmd, "r")
+
+    for line in fh:
+        line  = line.strip("\n")
+        parts = line.split("\t")
+
+        loc_id = int(parts[0])
+        chr    = parts[2]
+        bp     = int(parts[3])
+        mapq   = int(parts[4])
+        cigar  = parts[5]
+
+        edit_dist = 0
+        for part in parts:
+            if part[0:5] == "NM:i:":
+                edit_dist = int(part[5:])
+
+        tot_aln += 1
+
+        if loc_id not in tot_loc:
+            tot_loc[loc_id] = 0
+
+        #
+        # Unmapped, primary, secondary, or supplementary alignment?
+        #
+        flag = int(parts[1])
+
+        if ((flag & AlnType.secondary.value) == AlnType.secondary.value):
+            atype = AlnType.secondary
+            sec_cnt += 1
+            tot_loc[loc_id] += 1
+            continue
+        elif ((flag & AlnType.supplementary.value) == AlnType.supplementary.value):
+            atype = AlnType.supplementary
+            sup_cnt += 1
+            tot_loc[loc_id] += 1
+            continue
+        elif ((flag & AlnType.unmapped.value) == AlnType.unmapped.value):
+            unm_cnt += 1
+            continue
+        else:
+            atype = AlnType.primary
+            pri_cnt += 1
+            tot_loc[loc_id] += 1
+
+        #
+        # Mapped to the negative strand?
+        #
+        if ((flag & 16) == 16):
+            strand = 0
+        else:
+            strand = 1
+
+        aln = Aln(chr, bp, strand, atype, mapq, cigar, edit_dist)
+
+        loc = Locus(loc_id)
+        loc.pri_aln = aln
+        if loc_id in loci:
+            print("  Warning: locus {} has more than one primary alignment.".format(locus), file=sys.stderr)
+        else:
+            loci[loc_id] = loc
+        
+    fh.close()
+
+    loc_unmapped = 0
+    loc_onemap   = 0
+    loc_multimap = 0
+    for loc_id in tot_loc:
+        if tot_loc[loc_id] == 0:
+            loc_unmapped += 1
+        elif tot_loc[loc_id] == 1:
+            loc_onemap += 1
+        else:
+            loc_multimap += 1
+            
+    print("  Read {} total alignments: {} mapped; {} unmapped; {} secondary alignments; {} supplementary alignments.".format(
+        tot_aln, pri_cnt, unm_cnt, sec_cnt, sup_cnt), file=sys.stderr)
+    print("  {} total de novo catalog loci seen; {} had multiple alignments; {} had one alignment; {} were unmapped.".format(
+        len(tot_loc), loc_multimap, loc_onemap, loc_unmapped), file=sys.stderr)
+
+
+def filter_alns(loci):
+    #
+    # Filter the alignments by MAPQ, percent identity, and alignment coverage.
+    #
+    mapq_filt   = 0
+    pctid_filt  = 0
+    alncov_filt = 0
+    loc_rem     = 0
+    for loc_id in loci:
+
+        if loci[loc_id].pri_aln.mapq < min_mapq:
+            mapq_filt += 1
+            loci[loc_id].pri_aln = None
+        elif loci[loc_id].pri_aln.aln_pct < min_aln_cov:
+            alncov_filt += 1
+            loci[loc_id].pri_aln = None
+        elif loci[loc_id].pri_aln.pct_id < min_pct_id:
+            pctid_filt += 1
+            loci[loc_id].pri_aln = None
+        else:
+            loc_rem += 1
+
+    tot_filt = mapq_filt + pctid_filt + alncov_filt
+
+    print("  {} loci filtered; {} due to mapping quality (mapq); {} due to alignment coverage; {} due to percent identity; {} loci remain.".format(
+        tot_filt, mapq_filt, alncov_filt, pctid_filt, loc_rem), file=sys.stderr)
+
+
+def check_locus_bounds(loci, chrs):
+    #
+    # Translate coordinates on the negative strand and check locus bounds.
+    #
+    chr_key = {}
+    for chr, clen in chrs:
+        chr_key[chr] = clen
+
+    art_corrected = 0
+    ref_too_small = 0
+    
+    for loc_id in loci:
+        loc = loci[loc_id]
+
+        if loc.pri_aln == None:
+            continue
+
+        #
+        # Adjust the alignmnet position of negative strand alignments to the RAD cutsite -- the 3' end.
+        #
+        if loc.pri_aln.strand == 0:
+            loc.pri_aln.bp += loc.pri_aln.r_len
+            position = "{}:{}:-".format(loc.pri_aln.chr, int(loc.pri_aln.bp))
+        else:
+            position = "{}:{}:+".format(loc.pri_aln.chr, int(loc.pri_aln.bp))
+        loc.position = position
+
+        #
+        # Check that locus does not reach boundaries of the reference scaffold/chromosome
+        #
+        if loc.pri_aln.strand == 0:
+            if loc.pri_aln.bp - loc.pri_aln.q_len < 0:
+                #
+                # The aligned locus extends prior to the start of the reference scaffold. Without
+                # correction, this could result in Stacks translating SNP coordinates into this
+                # 'negative' space.
+                #
+                loc.pri_aln.bp = loc.pri_aln.bp + (loc.pri_aln.q_len - loc.pri_aln.bp)
+                if loc.pri_aln.bp > chr_key[loc.pri_aln.chr]:
+                    if verbose:
+                        print("Locus {}, aligned to {} [CIGAR {}] extends past the starting bound; reference is too small to correct, removing alignment.".format(
+                            loc.id_old, position, loc.pri_aln.cigar_str), file=sys.stderr)
+                    loc.pri_aln = None
+                    ref_too_small += 1
+                    continue
+                else:
+                    art_corrected += 1
+                    loc.position   = "{}:{}:-".format(loc.pri_aln.chr, int(loc.pri_aln.bp))
+                    if verbose:
+                        print("Locus {}, aligned to {} [CIGAR {}] extends past the starting bound; artifically adjusting to {} to prevent SNPs with negative basepair coordinates.".format(
+                            loc.id_old, position, loc.pri_aln.cigar_str, loc.position), file=sys.stderr)
+        else:
+            if loc.pri_aln.bp + loc.pri_aln.q_len > chr_key[loc.pri_aln.chr]:
+                #
+                # The aligned locus extends out past the end of the reference scaffold. Without
+                # correction, this could result in Stacks translating SNP coordinates into this
+                # 'positive' space.
+                #
+                loc.pri_aln.bp = loc.pri_aln.bp - (loc.pri_aln.q_len - loc.pri_aln.r_len) + 1
+                if loc.pri_aln.bp < 0:
+                    if verbose:
+                        print("Locus {}, aligned to {} [CIGAR {}] extends past the end bound; reference is too small to correct, removing alignment.".format(
+                            loc.id_old, position, loc.pri_aln.cigar_str), file=sys.stderr)
+                    loc.pri_aln = None
+                    ref_too_small += 1
+                    continue
+                else:
+                    art_corrected += 1
+                    loc.position   = "{}:{}:+".format(loc.pri_aln.chr, int(loc.pri_aln.bp))
+                    if verbose:
+                        print("Locus {}, aligned to {} [CIGAR {}] extends past the end bound; artifically adjusting to {} to prevent SNPs with non-existant basepair coordinates.".format(
+                            loc.id_old, position, loc.pri_aln.cigar_str, loc.position), file=sys.stderr)
+
+    print("  Artificially altered the alignment positions of {} loci; removed {} loci that could not be altered properly.".format(art_corrected, ref_too_small), file=sys.stderr)
+
+
+def write_catalog(loci, chrs, datestamp, new_locus_id, ordered_loci):
+    #
+    # Read in existing catalog sequences.
+    #
+    fh = gzip.open(stacks_path + "catalog.fa.gz", 'rt')
+
+    eof  = False
+    line = fh.readline()
+    if len(line) == 0:
+        return
+    line = line.strip('\n ')
+
+    while len(line) == 0 or line[0] == "#":
+        line = fh.readline()
+        if len(line) == 0:
+            return
+        line = line.strip('\n ')
+
+    while eof == False:
+        seq = ""
+
+        if line[0] == ">":
+            # Parse and store the comments from the ID line
+            parts  = line.split(' ')
+            loc_id = int(parts[0][1:])
+            ns     = parts[1]
+            if len(parts) > 2:
+                contig = parts[2]
+            line   = ""
+
+        while len(line) == 0 or line[0] != ">":
+            line = fh.readline()
+            if len(line) == 0:
+                eof = True
+                break
+            line = line.strip('\n ')
+            if len(line) == 0 or line[0] == "#":
+                continue
+            if line[0] != ">":
+                seq += line
+        #
+        # Record the final sequence.
+        #
+        if loc_id in loci:
+            loci[loc_id].num_samples = ns
+            loci[loc_id].contig      = contig
+            loci[loc_id].sequence    = seq
+
+    fh.close()
+    
+    #
+    # Sort the loci by chromosome, output from largest to smallest chromosome.
+    #
+    for loc_id in loci:
+        loc = loci[loc_id]
+
+        if loc.pri_aln == None:
+            continue
+
+        if loc.pri_aln.chr not in ordered_loci:
+            ordered_loci[loc.pri_aln.chr] = []
+        ordered_loci[loc.pri_aln.chr].append(loc)
+
+    for chr in ordered_loci:
+        ordered_loci[chr].sort(key=attrgetter("pri_aln.bp"))
+
+    #
+    # Assign a new locus ID ordered by alignment location.
+    #
+    new_id = new_locus_id
+    for chr, chrlen in chrs:
+        if chr not in ordered_loci:
+            continue
+        
+        for loc in ordered_loci[chr]:
+
+            if loc.pri_aln == None:
+                continue
+
+            loc.id_new = new_id
+            loci[loc.id_old].id_new = new_id
+            new_id += 1
+    
+    #
+    # Re-write the catalog FASTA file. Also write a file of chromosomes used.
+    #
+    o_fa    = out_path + "catalog.fa.gz"
+    out_fh  = gzip.open(o_fa, "wt")
+    out_fh.write("# Generated by stacks-integrate-alignments, version {}; date {}\n# {}\n".format(version, datestamp, " ".join(sys.argv)))
+    
+    chrs_fh = open(out_path + "catalog.chrs.tsv", "w")
+    chrs_fh.write("# Generated by stacks-integrate-alignments, version {}; date {}\n# {}\n".format(version, datestamp, " ".join(sys.argv)))
+    chrs_fh.write("# Chrom\tLength\n")
+    
+    for chr, chrlen in chrs:
+        if chr not in ordered_loci:
+            continue
+
+        chrs_fh.write("{}\t{}\n".format(chr, chrlen))
+        
+        for loc in ordered_loci[chr]:
+            out_fh.write(">{} pos={} {} {}\n{}\n".format(loc.id_new, loc.position, loc.num_samples, loc.contig, loc.sequence))
+
+    out_fh.close()
+    chrs_fh.close()
+    return o_fa
+
+
+def write_locus_coords(ordered_loci, chrs, datestamp, out_path):
+    #
+    # Translate coordinates on the negative strand and write a table of old/new locus IDs.
+    #
+    o_coords = out_path + "locus_coordinates.tsv"
+
+    out_fh = open(o_coords, "w")
+
+    out_fh.write("# Generated by stacks-integrate-alignments, version {}; date {}\n# {}\n".format(version, datestamp, " ".join(sys.argv)))
+    out_fh.write("# id_new\tid_old\taln_pos\n")
+
+    for chr, chrlen in chrs:
+        if chr not in ordered_loci:
+            continue
+
+        for loc in ordered_loci[chr]:
+            out_fh.write("{}\t{}\t{}\n".format(loc.id_new, loc.id_old, loc.position))
+
+    out_fh.close()
+    return o_coords
+
+    
+def write_catalog_calls(loci, chrs, stacks_path, out_path, datestamp):
+    #
+    # Read in existing catalog sequences.
+    #
+    fh = gzip.open(stacks_path + "catalog.calls", 'rt')
+
+    #
+    # Write the header.
+    #
+    out_fh = gzip.open(out_path + "catalog.calls", 'wt')
+
+    for line in fh:
+        if line[0:10] == "##filedate":
+            out_fh.write("##filedate={}\n".format(datestamp))
+            continue
+        elif line[0:8] == "##source":
+            out_fh.write("##source=\"Stacks v{}; {}\"\n".format(version, " ".join(sys.argv)))
+            continue
+        elif line[0] == "#":
+            out_fh.write(line)
+            continue
+        else:
+            break
+
+    out_fh.close()
+    
+    #
+    # Open the output pipe to sort and compress.
+    #
+    o_vcf   = out_path + "catalog.calls"
+    cmd     = "sort -k 1,1n -k 2,2n - | gzip >> " + o_vcf
+    out_fh  = os.popen(cmd, mode="w")
+
+    #
+    # Write the last line read above
+    #
+    line   = line.strip('\n')
+    parts  = line.split('\t')
+    loc_id = int(parts[0])
+
+    pre_loc = loc_id
+    loc_cnt = 0
+    col_cnt = 50
+
+    if loc_id in loci:
+        out_fh.write("{}\t{}\n".format(loci[loc_id].id_new, "\t".join(parts[1:])))
+
+    #
+    # Read from the input file, translate the locus ID, output to the pipe to sort and compress.
+    #
+    for line in fh:
+        line   = line.strip('\n')
+        parts  = line.split('\t')
+        loc_id = int(parts[0])
+        
+        if loc_id not in loci:
+            continue
+
+        if loci[loc_id].pri_aln == None:
+            continue
+
+        if loc_id != pre_loc:
+            pre_loc  = loc_id
+            if loc_cnt % 1000 == 0:
+                print(".", file=sys.stderr, end="", flush=True)
+                if col_cnt % 100 == 0:
+                    print("\n  ", file=sys.stderr, end="", flush=True)
+                col_cnt += 1
+            loc_cnt += 1
+
+        out_fh.write("{}\t{}\n".format(loci[loc_id].id_new, "\t".join(parts[1:])))
+
+    print("\n  Waiting for sort and gzip to finish...", file=sys.stderr)
+
+    fh.close()
+    out_fh.close()
+
+    return o_vcf
+
+
+##
+##------------------------------------------------------------------------------------------------##
+##
+
+loci = {}
+chrs = []
+
+parse_command_line()
+
+#
+# Find the maximum locus ID in the existing de novo catalog, and set our starting ID number for the integrated loci.
+#
+print("Finding the highest current locus ID... ", file=sys.stderr, end="")
+max_locus_id = find_max_locus_id(stacks_path)
+new_locus_id = max_locus_id + 1
+print(max_locus_id, file=sys.stderr)
+
+print("\nExtracting locus coordinates...", file=sys.stderr)
+parse_bam_file(bam_path, loci, chrs)
+
+print("\nFiltering alignments...", file=sys.stderr)
+filter_alns(loci)
+
+check_locus_bounds(loci, chrs)
+
+datestamp = datetime.now().strftime("%Y%m%d")
+
+ordered_loci = {}
+
+print("\nRewriting locus catalog sequences and information...", file=sys.stderr)
+o_fa = write_catalog(loci, chrs, datestamp, new_locus_id, ordered_loci)
+print("  Wrote " + o_fa, file=sys.stderr)
+
+o_coords = write_locus_coords(ordered_loci, chrs, datestamp, out_path)
+print("  Wrote " + o_coords, file=sys.stderr)
+
+print("\nReading, translating, and writing the catalog calls", file=sys.stderr, end="", flush=True)
+o_vcf = write_catalog_calls(loci, chrs, stacks_path, out_path, datestamp)
+print("  Wrote " + o_vcf, file=sys.stderr)
+
+pos      = sys.argv[0].rfind("/") + 1
+basename = sys.argv[0][pos:]
+print("\n{} is done.".format(basename))


=====================================
src/MetaPopInfo.h
=====================================
@@ -35,6 +35,12 @@ struct Group {
     static const string default_name;
 };
 
+struct Contig {
+    string name;
+    string length;
+    Contig(const string &n, const string &l) : name(n), length(l) {}
+};
+    
 /*
  * MetaPopInfo
  * Class for reprensenting a metapopulation : its samples, populations,
@@ -42,8 +48,9 @@ struct Group {
  */
 class MetaPopInfo {
     vector<Sample> samples_; //n.b. Samples are sorted primarily by population index, and secondarily by name.
-    vector<Pop> pops_;
-    vector<Group> groups_;
+    vector<Pop>    pops_;
+    vector<Group>  groups_;
+    vector<Contig> contigs_; // List of contig names and lengths for file exports.
 
     map<string,size_t> sample_indexes_; // Links a name with an index in [samples_].
     map<string,size_t> pop_indexes_;
@@ -86,15 +93,18 @@ public:
     size_t n_samples() const {return samples().size();}
     const vector<Pop>& pops() const {return pops_;}
     const vector<Group>& groups() const {return groups_;}
-
+    const vector<Contig>& contigs() const {return contigs_;}
+    
     const vector<size_t>& sample_indexes_orig_order() const {return sample_indexes_orig_order_;}
 
     size_t get_sample_index(const string& name, bool must_exist=true) const;
     size_t get_pop_index(const string& name) const {return pop_indexes_.at(name);}
     size_t get_group_index(const string& name) const {return group_indexes_.at(name);}
 
+    void   add_contig(const string &ctg_name, const string &ctg_len) { this->contigs_.push_back(Contig(ctg_name, ctg_len)); }
+    
     // Work with sample IDs. (IDs unicity is not enforced.)
-    void set_sample_id(size_t index, size_t id) {samples_.at(index).id = id; sample_indexes_by_id_[id] = index;}
+    void   set_sample_id(size_t index, size_t id) {samples_.at(index).id = id; sample_indexes_by_id_[id] = index;}
     size_t get_sample_index(const size_t& id) const {return sample_indexes_by_id_.at(id);}
 
     void status(ostream &fh);


=====================================
src/Vcf.cc
=====================================
@@ -1,3 +1,23 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
+//
+
 #include <ctime>
 
 #include "Vcf.h"
@@ -44,6 +64,32 @@ void VcfHeader::add_std_meta(const string& version) {
     add_meta(VcfMeta::predefs::format_GT);
 }
 
+void VcfHeader::add_std_meta_with_contigs(const vector<Contig> &contigs, const string& version) {
+    add_meta(VcfMeta("fileformat", version));
+
+    time_t t;
+    time(&t);
+    char date[9];
+    strftime(date, 9, "%Y%m%d", localtime(&t));
+    add_meta(VcfMeta("fileDate", date));
+
+    add_meta(VcfMeta("source", string("\"Stacks v") + VERSION + "\""));
+
+    for (auto &ctg: contigs)
+	add_meta(VcfMeta("contig", "<ID=" + ctg.name + ",length=" + ctg.length + ">"));
+    
+    add_meta(VcfMeta::predefs::info_AD);
+    add_meta(VcfMeta::predefs::info_AF);
+    add_meta(VcfMeta::predefs::info_DP);
+    add_meta(VcfMeta::predefs::info_NS);
+    add_meta(VcfMeta::predefs::format_AD);
+    add_meta(VcfMeta::predefs::format_DP);
+    add_meta(VcfMeta::predefs::format_HQ);
+    add_meta(VcfMeta::predefs::format_GL);
+    add_meta(VcfMeta::predefs::format_GQ);
+    add_meta(VcfMeta::predefs::format_GT);
+}
+
 void VcfRecord::assign(const char* rec, size_t len, const VcfHeader& header) {
     assert(rec[len-1] != '\n');
 


=====================================
src/Vcf.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2015, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -24,6 +24,7 @@
 #include "constants.h"
 #include "nucleotides.h"
 #include "utils.h"
+#include "MetaPopInfo.h"
 
 /*
  * VcfMeta
@@ -82,6 +83,9 @@ public:
     // Creates a standard header.
     void add_std_meta(const string& version = "VCFv4.2");
 
+    // Creates a standard header with contigs defined.
+    void add_std_meta_with_contigs(const vector<Contig> &, const string& version = "VCFv4.2");
+
     static const string std_fields;
 
     friend ostream& operator<< (ostream& os, const VcfHeader& h) {h.print(os); return os;}


=====================================
src/debruijn.cc
=====================================


=====================================
src/debruijn.h
=====================================


=====================================
src/export_formats.cc
=====================================
@@ -2187,10 +2187,27 @@ int
 PhylipVarAllExport::write_header()
 {
     char s[id_len];
-    
+
+    //
+    // Find the longest population name.
+    //
+    size_t max_nlen = 0;
+    for (size_t p = 0; p < this->_mpopi->pops().size(); ++p) {
+        const Pop& pop = this->_mpopi->pops()[p];
+        max_nlen = pop.name.length() > max_nlen ? pop.name.length() : max_nlen;
+    }
+    if (max_nlen > 10) max_nlen = 10;
+
     for (size_t p = 0; p < this->_mpopi->pops().size(); ++p) {
         const Pop& pop = this->_mpopi->pops()[p];
-        this->_outstrs[p] += pop.name.substr(0, 10) + "\t";
+        this->_outstrs[p] += pop.name.substr(0, 10);
+        //
+        // If the population name is less than the limit allowed, pad
+        // it with spaces so DNA sequences stay in phase when writing batches.
+        //
+        if (pop.name.length() < max_nlen)
+            this->_outstrs[p] += string(max_nlen - pop.name.length(), ' ');
+        this->_outstrs[p] += "\t";
     }
 
     sprintf(s, "% 11d", 0);
@@ -2216,17 +2233,6 @@ PhylipVarAllExport::write_batch(const vector<LocBin*>& loci)
         const LocBin*    loc = loci[k];
         const CSLocus*  cloc = loc->cloc;
 
-        // bool include;
-        // include = true;
-        // for (uint i = 0; i < cloc->snps.size(); i++) {
-        //     uint col = cloc->snps[i]->col;
-
-        //     if (t->nucs[col].allele_cnt != 2)
-        //         include = false;
-        // }
-        // if (!include)
-        //     continue;
-
         seq = new char[cloc->len + 1];
         strcpy(seq, cloc->con);
 
@@ -2236,9 +2242,6 @@ PhylipVarAllExport::write_batch(const vector<LocBin*>& loci)
             for (uint snp_index = 0; snp_index < cloc->snps.size(); snp_index++) {
                 uint col = cloc->snps[snp_index]->col;
 
-                // if (t->nucs[col].allele_cnt != 2)
-                //     continue;
-                
                 seq[col] = iupac_encode(s->nucs[col].p_nuc, s->nucs[col].q_nuc);
             }
 
@@ -2260,7 +2263,7 @@ PhylipVarAllExport::write_batch(const vector<LocBin*>& loci)
     size_t line_len = 120;
     size_t i = 0;
     size_t seql = this->_outstrs.begin()->second.length();
-    
+
     while (i < seql) {
         for (map<int, string>::iterator it = this->_outstrs.begin(); it != this->_outstrs.end(); it++) {
             this->_fh << it->second.substr(i, line_len) << "\n";
@@ -2279,7 +2282,7 @@ PhylipVarAllExport::write_batch(const vector<LocBin*>& loci)
         for (map<int, string>::iterator it = this->_outstrs.begin(); it != this->_outstrs.end(); it++)
             it->second = "";
     }
-    
+
     return 0;
 }
 
@@ -2513,7 +2516,15 @@ VcfExport::open(const MetaPopInfo *mpopi)
     this->_mpopi = mpopi;
 
     VcfHeader header;
-    header.add_std_meta();
+    //
+    // Check for the existence of a contigs definition file, include them in the VCF header if found.
+    //
+    const vector<Contig> &contigs = this->_mpopi->contigs();
+    if (contigs.size() > 0)
+	header.add_std_meta_with_contigs(contigs);
+    else
+	header.add_std_meta();
+    
     header.add_meta(VcfMeta::predefs::info_loc_strand);
     for(size_t s : this->_mpopi->sample_indexes_orig_order())
         header.add_sample(this->_mpopi->samples()[s].name);
@@ -2611,7 +2622,15 @@ VcfHapsExport::open(const MetaPopInfo *mpopi)
     this->_mpopi = mpopi;
 
     VcfHeader header;
-    header.add_std_meta();
+    //
+    // Check for the existence of a contigs definition file, include them in the VCF header if found.
+    //
+    const vector<Contig> &contigs = this->_mpopi->contigs();
+    if (contigs.size() > 0)
+	header.add_std_meta_with_contigs(contigs);
+    else
+	header.add_std_meta();
+
     header.add_meta(VcfMeta::predefs::info_loc_strand);
     for(size_t s : this->_mpopi->sample_indexes_orig_order())
         header.add_sample(this->_mpopi->samples()[s].name);
@@ -2621,8 +2640,11 @@ VcfHapsExport::open(const MetaPopInfo *mpopi)
     return 0;
 }
 
-int VcfHapsExport::write_batch(const vector<LocBin*>& loci){
+int
+VcfHapsExport::write_batch(const vector<LocBin*>& loci)
+{
     VcfRecord rec;
+
     for (const LocBin* locbin : loci) {
         const CSLocus* cloc = locbin->cloc;
         Datum const*const* d = locbin->d;


=====================================
src/gstacks.cc
=====================================


=====================================
src/gzFasta.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2013-2017, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2013-2021, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -31,28 +31,44 @@ void
 GzFasta::open(const char *path)
 {
     this->gz_fh = gzopen(path, "rb");
+
     if (!this->gz_fh) {
         cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
         exit(EXIT_FAILURE);
     }
+
     #if ZLIB_VERNUM >= 0x1240
     gzbuffer(this->gz_fh, libz_buffer_size);
     #endif
-    int first = gzgetc(gz_fh);
-    if (first == -1) {
+
+    char *res = NULL;
+
+    // Skip blank lines and/or comments.
+    do {
+        res = gzgets(this->gz_fh, this->line, max_len);
+    } while (res != NULL && !gzeof(this->gz_fh) && (this->line[0] == '\n' || this->line[0] == '\r' || this->line[0] == '#'));
+
+    if (res == NULL && !gzeof(this->gz_fh)) {
         int errnum;
-        const char* errstr = gzerror(gz_fh, &errnum);
+        const char* errstr = gzerror(this->gz_fh, &errnum);
         cerr << "Error: Failed to read any content from '" << path
             << "' (gzerror: " << errstr << ").\n";
         exit(EXIT_FAILURE);
-    } else if (first != '>') {
+
+    } else if (this->line[0] != '>' && !gzeof(this->gz_fh)) {
         cerr << "Error: '" << path << "': not in fasta format (expected '>', got 0x"
-             << std::hex << first << " '" << flush << (char) first << "')."
+             << std::hex << this->line[0] << " '" << flush << (char) this->line[0] << "')."
              << " (note: using zlib version " << zlibVersion()
              << "; compiled with zlib version " << ZLIB_VERSION << ")\n" << flush;
         exit(EXIT_FAILURE);
+	
+    } else if (gzeof(this->gz_fh)) {
+        cerr << "Error: '" << path << "', unknown file format.\n" << flush;
+        exit(EXIT_FAILURE);
     }
-    gzrewind(gz_fh);
+
+    memset(this->line, '\0', max_len);
+    gzrewind(this->gz_fh);
 }
 
 Seq *
@@ -77,7 +93,7 @@ GzFasta::next_seq(Seq &s)
 
     //
     // Check the contents of the line buffer. When we finish reading a FASTA record
-    // the buffer will either contain whitespace or the header of the next FAST
+    // the buffer will either contain whitespace or the header of the next FASTA
     // record.
     //
     while (this->line[0] != '>' && !gzeof(this->gz_fh)) {


=====================================
src/gzFasta.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2013-2017, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2013-2021, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //


=====================================
src/locus.cc
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-OA
 //
-// Copyright 2013-2018, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2013-2021, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //
@@ -70,6 +70,23 @@ Locus::Locus(const Locus &templ)
 
 }
 
+uint
+Locus::sort_bp(uint col0) const
+{
+    if (this->loc.strand == strand_plus) {
+	return this->loc.bp + col0;
+    } else {
+	// If the locus is aligned to the negative strand and longer than
+	// the contig the returned column will be negative and will wrap
+	// around as an unsigned integer.
+	if (col0 > this->loc.bp) {
+	    // cerr << "locus::sort_bp; ID: " << this->id << "; strand: " << (this->loc.strand == strand_plus? "plus" : "minus") << "; col: " << col0 << "; bp: " << this->loc.bp << "; diff: " << this->loc.bp - col0 << "\n";
+	    DOES_NOT_HAPPEN;
+	} else
+	    return this->loc.bp - col0;
+    }
+}
+
 int
 Locus::snp_index(uint col) const
 {


=====================================
src/locus.h
=====================================
@@ -108,7 +108,7 @@ class Locus {
             delete [] reads[i];
     }
     uint sort_bp() const {return this->loc.bp;}
-    uint sort_bp(uint col0) const {return this->loc.bp + (this->loc.strand==strand_plus ? col0 : -col0);}
+    uint sort_bp(uint col0) const;
     int snp_index(uint) const;
     int add_consensus(const char *);
     int add_model(const char *);


=====================================
src/models.h
=====================================
@@ -1,6 +1,6 @@
 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
 //
-// Copyright 2010-2012, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2021, Julian Catchen <jcatchen at illinois.edu>
 //
 // This file is part of Stacks.
 //


=====================================
src/populations.cc
=====================================
@@ -477,6 +477,12 @@ BatchLocusProcessor::init(string in_path, string pmap_path)
     else
         this->init_stacks_loci(in_path, pmap_path);
 
+    //
+    // Check for the existence of a contig definition file (for aligned, or
+    // integrated-aligned data) and read it in if found.
+    //
+    this->parse_contig_definitions(in_path);
+    
     //
     // Initialize our per-population haplotype counters.
     //
@@ -486,6 +492,69 @@ BatchLocusProcessor::init(string in_path, string pmap_path)
     return 0;
 }
 
+int
+BatchLocusProcessor::parse_contig_definitions(string in_path)
+{
+    char line[max_len];
+
+    string path = in_path + "/catalog.chrs.tsv";
+
+    //
+    // Check if the file is present.
+    //
+    struct stat s;
+    if (stat(path.c_str(), &s) < 0)
+	return 0;
+    
+    ifstream fh(path.c_str(), ifstream::in);
+
+    if (fh.fail()) {
+        cerr << "Error opening contig definition file '" << path << "'\n";
+        exit(1);
+    }
+
+    vector<string> parts;
+    int  cnt = 0;
+    uint line_num = 0;
+    while (fh.getline(line, max_len)) {
+        ++line_num;
+
+        //
+        // Skip blank & commented lines ; correct windows-style line ends.
+        //
+        size_t len = strlen(line);
+        if (len == 0) {
+            continue;
+        } else if (line[len-1] == '\r') {
+            line[len-1] = '\0';
+            --len;
+            if (len == 0)
+                continue;
+        }
+        char* p = line;
+        while (isspace(*p) && *p != '\0')
+            ++p;
+        if (*p == '#')
+            continue;
+
+        //
+        // Parse the contig defintion line, we expect:
+        // <contig name><tab><contig length>
+        //
+        parse_tsv(line, parts);
+
+        if (parts.size() != 2) {
+            cerr << "Error: Incorrect number of columns in contig definition file '" << path << "' at line " << line_num << "\n";
+            exit(1);
+        }
+
+        this->_mpopi->add_contig(parts[0], parts[1]);
+	cnt++;
+    }
+
+    return cnt;
+}
+
 size_t
 BatchLocusProcessor::next_batch(ostream &log_fh)
 {
@@ -496,7 +565,8 @@ BatchLocusProcessor::next_batch(ostream &log_fh)
         return this->next_batch_stacks_loci(log_fh);
 }
 
-void BatchLocusProcessor::batch_clear()
+void
+BatchLocusProcessor::batch_clear()
 {
     for (LocBin* loc : this->_loci)
         delete loc;
@@ -1526,19 +1596,20 @@ CatalogDists::accumulate_pre_filtering(const size_t sample_cnt, const CSLocus *l
 int
 CatalogDists::accumulate(const vector<LocBin *> &loci)
 {
-    const CSLocus *loc;
+    const CSLocus *cloc;
     const LocTally *t;
+    Datum   **d;
     size_t missing;
 
     for (uint i = 0; i < loci.size(); i++) {
-        loc = loci[i]->cloc;
+        cloc = loci[i]->cloc;
 
-        if (this->_post_valid.count(loc->cnt) == 0)
-            this->_post_valid[loc->cnt] = 1;
+        if (this->_post_valid.count(cloc->cnt) == 0)
+            this->_post_valid[cloc->cnt] = 1;
         else
-            this->_post_valid[loc->cnt]++;
+            this->_post_valid[cloc->cnt]++;
 
-        missing = loci[i]->sample_cnt - loc->cnt;
+        missing = loci[i]->sample_cnt - cloc->cnt;
 
         if (this->_post_absent.count(missing) == 0)
             this->_post_absent[missing] = 1;
@@ -1550,7 +1621,7 @@ CatalogDists::accumulate(const vector<LocBin *> &loci)
         //
         t = loci[i]->s->meta_pop();
         size_t n_actual_snps = 0;
-        for (const SNP* snp : loc->snps)
+        for (const SNP* snp : cloc->snps)
             if (!t->nucs[snp->col].fixed)
                 ++n_actual_snps;
 
@@ -1558,13 +1629,38 @@ CatalogDists::accumulate(const vector<LocBin *> &loci)
             this->_post_snps_per_loc[n_actual_snps] = 1;
         else
             this->_post_snps_per_loc[n_actual_snps]++;
+
+        //
+        // Record which individuals are missing loci and/or SNPs.
+        //
+        d = loci[i]->d;
+
+        for (uint j = 0; j < this->_sample_cnt; j++) {
+            if (d[j] == NULL) {
+                this->_loci_miss_per_sample[j]++;
+                this->_sites_miss_per_sample[j] += n_actual_snps;
+                continue;
+            }
+            this->_loci_pres_per_sample[j]++;
+
+            for (const SNP* snp : cloc->snps) {
+                if (t->nucs[snp->col].fixed)
+                    continue;
+                if (snp->col >= (uint) d[j]->len || d[j]->model[snp->col] == 'U')
+                    this->_sites_miss_per_sample[j]++;
+                else
+                    this->_sites_pres_per_sample[j]++;
+            }
+        }
+        this->_total_sites += n_actual_snps;
     }
+    this->_total_loci += loci.size();
 
     return 0;
 }
 
 int
-CatalogDists::write_results(ostream &log_fh)
+CatalogDists::write_results(ostream &log_fh, const MetaPopInfo *mpopi)
 {
     map<size_t, size_t>::iterator cnt_it;
     string section;
@@ -1618,6 +1714,32 @@ CatalogDists::write_results(ostream &log_fh)
         log_fh << cnt_it->first << "\t" << cnt_it->second << "\n";
     end_section();
 
+    const vector<Sample> &s = mpopi->samples();
+    
+    begin_section("loci_per_sample");
+    log_fh << "# Number of loci per individual sample (after filtering).\n"
+           << "sample\tn_loci\tpresent_loci\tmissing_loci\tfrequency_missing\n";
+    for (uint i = 0; i < this->_sample_cnt; i++)
+        log_fh << s[i].name << "\t"
+               << this->_total_loci << "\t"
+               << this->_loci_pres_per_sample[i] << "\t"
+               << this->_loci_miss_per_sample[i] << "\t"
+               << fixed << setprecision(4)
+               << (double) this->_loci_miss_per_sample[i] / (double) this->_total_loci << "\n";
+    end_section();
+
+    begin_section("variant_sites_per_sample");
+    log_fh << "# Number of variant sites per individual sample (after filtering).\n"
+           << "sample\tn_sites\tpresent_sites\tmissing_sites\tfrequency_missing\n";
+    for (uint i = 0; i < this->_sample_cnt; i++)
+        log_fh << s[i].name << "\t"
+               << this->_total_sites << "\t"
+               << this->_sites_pres_per_sample[i] << "\t"
+               << this->_sites_miss_per_sample[i] << "\t"
+               << fixed << setprecision(4)
+               << (double) this->_sites_miss_per_sample[i] / (double) this->_total_sites << "\n";
+    end_section();
+
     return 0;
 }
 


=====================================
src/populations.h
=====================================
@@ -22,10 +22,11 @@
 #define __POPULATIONS_H__
 
 #ifdef _OPENMP
-#include <omp.h>    // OpenMP library
+#include <omp.h>      // OpenMP library
 #endif
-#include <getopt.h> // Process command-line options
-#include <dirent.h> // Open/Read contents of a directory
+#include <getopt.h>   // Process command-line options
+#include <dirent.h>   // Open/Read contents of a directory
+#include <sys/stat.h> // Check for existence of files
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
@@ -104,11 +105,85 @@ class CatalogDists {
     map<size_t, size_t> _pre_valid,  _pre_absent,  _pre_confounded;  // Prior to any filtering.
     map<size_t, size_t> _post_valid, _post_absent, _post_confounded; // After filtering.
     map<size_t, size_t> _pre_snps_per_loc, _post_snps_per_loc;
+    
+    size_t _pop_cnt, _sample_cnt;
+    size_t _total_sites, _total_loci;
+    size_t *_pop_order, *_samples;
+    size_t *_sites_pres_per_sample, *_sites_miss_per_sample;
+    size_t *_loci_pres_per_sample,  *_loci_miss_per_sample;
 
 public:
+    CatalogDists() {
+        this->_pop_cnt    = 0;
+        this->_sample_cnt = 0;
+        this->_pop_order  = NULL; // The array order of each population.
+        this->_samples    = NULL; // Which population each sample belongs to.
+
+        this->_sites_pres_per_sample = NULL; // Array of samples tallying # variant sites present
+        this->_sites_miss_per_sample = NULL; // Array of samples tallying # variant sites missing
+        this->_loci_pres_per_sample  = NULL; // Array of samples tallying # loci present
+        this->_loci_miss_per_sample  = NULL; // Array of samples tallying # loci missing
+
+        this->_total_loci  = 0;
+        this->_total_sites = 0;
+    }
+    CatalogDists(const MetaPopInfo *mpopi) {
+        assert(mpopi != NULL);
+
+        this->_pop_cnt    = mpopi->pops().size();
+        this->_sample_cnt = mpopi->n_samples();
+
+        assert(this->_pop_cnt > 0);
+        assert(this->_sample_cnt > 0);
+
+        this->_pop_order = new size_t [this->_pop_cnt];
+        this->_samples   = new size_t [this->_sample_cnt];
+        this->_sites_pres_per_sample = new size_t [this->_sample_cnt];
+        this->_sites_miss_per_sample = new size_t [this->_sample_cnt];
+        this->_loci_pres_per_sample  = new size_t [this->_sample_cnt];
+        this->_loci_miss_per_sample  = new size_t [this->_sample_cnt];
+
+        memset(this->_sites_pres_per_sample, 0, this->_sample_cnt * sizeof(size_t));
+        memset(this->_sites_miss_per_sample, 0, this->_sample_cnt * sizeof(size_t));
+        memset(this->_loci_pres_per_sample,  0, this->_sample_cnt * sizeof(size_t));
+        memset(this->_loci_miss_per_sample,  0, this->_sample_cnt * sizeof(size_t));
+        
+        this->_total_loci  = 0;
+        this->_total_sites = 0;
+
+        size_t pop_sthg = 0;
+
+        for (size_t i_pop = 0; i_pop < mpopi->pops().size(); ++i_pop) {
+            const Pop& pop = mpopi->pops()[i_pop];
+
+            for (uint i = pop.first_sample; i <= pop.last_sample; i++)
+                this->_samples[i] = pop_sthg;
+            this->_pop_order[pop_sthg] = i_pop;
+
+            pop_sthg++;
+        }
+    }
+    ~CatalogDists() {
+        if (this->_pop_order != NULL)
+            delete [] this->_pop_order;
+        if (this->_samples != NULL)
+            delete [] this->_samples;
+        if (this->_sites_pres_per_sample != NULL)
+            delete [] this->_sites_pres_per_sample;
+        if (this->_sites_miss_per_sample != NULL)
+            delete [] this->_sites_miss_per_sample;
+        if (this->_loci_pres_per_sample != NULL)
+            delete [] this->_loci_pres_per_sample;
+        if (this->_loci_miss_per_sample != NULL)
+            delete [] this->_loci_miss_per_sample;
+    }
+    // Remove the copy and move constructors.
+    CatalogDists(const CatalogDists &) = delete;
+    CatalogDists(CatalogDists &&) = delete;
+
     int accumulate_pre_filtering(const size_t, const CSLocus *);
     int accumulate(const vector<LocBin *> &);
-    int write_results(ostream &log_fh);
+    int write_results(ostream &log_fh, const MetaPopInfo *mpopi);
 };
 
 //
@@ -254,18 +329,22 @@ private:
 class BatchLocusProcessor {
 public:
     BatchLocusProcessor():
+	_sig_hwe_dev(NULL),
         _input_mode(InputMode::stacks2), _user_supplied_whitelist(false), _batch_size(0), _batch_num(0),
         _mpopi(NULL), _next_loc(NULL), _unordered_bp(0) {}
     BatchLocusProcessor(InputMode mode, size_t batch_size, MetaPopInfo *popi):
+	_sig_hwe_dev(NULL),
         _input_mode(mode), _user_supplied_whitelist(false), _batch_size(batch_size), _batch_num(0),
-        _mpopi(popi), _next_loc(NULL), _unordered_bp(0) {}
+        _mpopi(popi), _next_loc(NULL), _dists(popi), _unordered_bp(0) {}
     BatchLocusProcessor(InputMode mode, size_t batch_size):
+	_sig_hwe_dev(NULL),
         _input_mode(mode), _user_supplied_whitelist(false), _batch_size(batch_size), _batch_num(0),
         _mpopi(NULL), _next_loc(NULL), _unordered_bp(0) {}
     ~BatchLocusProcessor() {
         for (uint i = 0; i < this->_loci.size(); i++)
             delete this->_loci[i];
-        delete [] this->_sig_hwe_dev;
+	if (this->_sig_hwe_dev != NULL)
+	    delete [] this->_sig_hwe_dev;
     };
 
     int            init(string, string);
@@ -273,7 +352,7 @@ public:
     size_t         next_batch_number() { return this->_batch_num + 1; }
     int            summarize(ostream &);
     int            hapstats(ostream &);
-    int            write_distributions(ostream &log_fh) { return this->_dists.write_results(log_fh); }
+    int            write_distributions(ostream &log_fh) { return this->_dists.write_results(log_fh, this->_mpopi); }
 
     MetaPopInfo*   pop_info()      { return this->_mpopi; }
     int            pop_info(MetaPopInfo *popi) { this->_mpopi = popi; return 0; }
@@ -290,7 +369,7 @@ public:
     const CatalogDists&     dists()  { return this->_dists; }
 
     // Per-population haplotype counters
-    size_t           *_sig_hwe_dev;
+    size_t      *_sig_hwe_dev;
 
 private:
     InputMode    _input_mode;
@@ -300,16 +379,16 @@ private:
     MetaPopInfo *_mpopi;      // Population Map
 
     // Parsers
-    VcfParser     _vcf_parser;
-    VcfCLocReader _cloc_reader;
-    GzFasta       _fasta_reader;
+    VcfParser      _vcf_parser;
+    VcfCLocReader  _cloc_reader;
+    GzFasta        _fasta_reader;
     vector<size_t> _samples_vcf_to_mpopi;
 
     // Data stores
-    vector<LocBin *>  _loci;
-    LocBin           *_next_loc;
-    string            _chr;
-    CatalogDists      _dists;
+    vector<LocBin *> _loci;
+    LocBin          *_next_loc;
+    string           _chr;
+    CatalogDists     _dists;
 
     // Counters for external VCF
     size_t            _total_ext_vcf;
@@ -325,6 +404,7 @@ private:
 private:
     int    init_external_loci(string, string);
     int    init_stacks_loci(string, string);
+    int    parse_contig_definitions(string);
     void   batch_clear();
     size_t next_batch_external_loci(ostream &);
     size_t next_batch_stacks_loci(ostream &);


=====================================
src/process_radtags.cc
=====================================
@@ -927,7 +927,15 @@ print_results(int argc, char **argv,
 
     init_log(log, argc, argv);
 
-    log << "File\t"
+    log << "\n"
+	<< "# Note: Individual distributions can be extracted using the `stacks-dist-extract` utility.\n"
+	<< "#       e.g. `stacks-dist-extract "
+	<< log_path.c_str() + (log_path.find_last_of('/') + 1)
+	<<" dist_name`\n";
+
+    log << "\n"
+	<< "BEGIN per_file_raw_read_counts" << "\n"
+	<< "File\t"
         << "Retained Reads\t";
     if (filter_illumina)
         log << "Illumina Filtered\t";
@@ -950,7 +958,8 @@ print_results(int argc, char **argv,
             << it->second["noradtag"]    << "\t"
             << it->second["total"]       << "\n";
     }
-
+    log << "END per_file_raw_read_counts" << "\n";
+    
     map<string, long> c;
     c["total"]        = 0;
     c["low_quality"]  = 0;
@@ -988,7 +997,8 @@ print_results(int argc, char **argv,
     print_nreads(c["noradtag"], "RAD cutsite not found drops");
     print_nreads(c["retained"], "retained reads");
 
-    log        << "\n"
+    log  << "\n"
+	 << "BEGIN total_raw_read_counts" << "\n"
          << "Total Sequences\t"      << c["total"]       << "\n";
     if (filter_illumina)
         log << "Failed Illumina filtered reads\t" << c["ill_filtered"] << "\n";
@@ -997,7 +1007,8 @@ print_results(int argc, char **argv,
     log << "Barcode Not Found\t"     << c["ambiguous"]   << "\n"
         << "Low Quality\t"           << c["low_quality"] << "\n"
         << "RAD Cutsite Not Found\t" << c["noradtag"]    << "\n"
-        << "Retained Reads\t"        << c["retained"]    << "\n";
+        << "Retained Reads\t"        << c["retained"]    << "\n"
+	<< "END total_raw_read_counts" << "\n";
 
     if (max_bc_size_1 == 0) return 0;
 
@@ -1015,13 +1026,14 @@ print_results(int argc, char **argv,
     // Print out barcode information.
     //
     log << "\n"
-        << "Barcode\t";
+        << "BEGIN per_barcode_raw_read_counts" << "\n"
+	<< "Barcode\t";
     if (bc_names)
         log << "Filename\t";
     log << "Total\t"
-        << "NoRadTag\t"
-        << "LowQuality\t"
-        << "Retained\n";
+        << "RAD Cutsite Not Found\t"
+        << "Low Quality\t"
+        << "Retained Reads\n";
 
     set<BarcodePair> barcode_list;
 
@@ -1041,8 +1053,10 @@ print_results(int argc, char **argv,
                 << barcode_log[barcodes[i]]["retained"] << "\n";
     }
 
-    log << "\n"
-        << "Sequences not recorded\n"
+    log << "END per_barcode_raw_read_counts" << "\n"
+	<< "\n"
+        << "BEGIN barcodes_not_recorded" << "\n"
+	<< "# A list and count of barcodes found that were not specified in the barcodes input file.\n"
         << "Barcode\t"
         << "Total\n";
 
@@ -1063,6 +1077,8 @@ print_results(int argc, char **argv,
             << bcs[i].second << "\n";
     }
 
+    
+    log << "END barcodes_not_recorded" << "\n";
     log.close();
 
     return 0;



View it on GitLab: https://salsa.debian.org/med-team/stacks/-/compare/4a6139a7ac614a947168a87fd345048133ce1ecf...5c517ee9e9fce91eac9f44f0e9b103eed48351da

-- 
View it on GitLab: https://salsa.debian.org/med-team/stacks/-/compare/4a6139a7ac614a947168a87fd345048133ce1ecf...5c517ee9e9fce91eac9f44f0e9b103eed48351da
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210714/fa51e73e/attachment-0001.htm>


More information about the debian-med-commit mailing list