[med-svn] [Git][med-team/stacks][upstream] New upstream version 2.66+dfsg
Lance Lin (@linqigang)
gitlab at salsa.debian.org
Fri Jan 19 16:12:24 GMT 2024
Lance Lin pushed to branch upstream at Debian Med / stacks
Commits:
60c4eb63 by Lance Lin at 2024-01-19T21:03:50+07:00
New upstream version 2.66+dfsg
- - - - -
23 changed files:
- ChangeLog
- Makefile.am
- configure.ac
- scripts/stacks-dist-extract
- scripts/stacks-integrate-alignments
- + src/FaidxI.h
- src/FastaI.h
- src/PopSum.cc
- src/Vcf.cc
- src/Vcf.h
- src/clean.cc
- src/clean.h
- src/clone_filter.cc
- src/clone_filter.h
- src/file_io.cc
- src/file_io.h
- src/gstacks.cc
- src/gstacks.h
- src/populations.cc
- src/process_radtags.cc
- src/renz.cc
- src/renz.h
- src/stacks.h
Changes:
=====================================
ChangeLog
=====================================
@@ -1,3 +1,21 @@
+Stacks 2.66 - December 5, 2023
+------------------------------
+    Feature: Rewrote stacks-dist-extract in Python, including support for partial
+             section names, streaming from standard input, and other improvements.
+    Feature: Included a new stacks-private-alleles script that extracts private
+             allele data from the populations program outputs, produces useful
+             summaries, and prepares the data for plotting.
+    Bugfix: In clone_filter, users sometimes specified a single oligo sequence on
+            the paired read but gave the length of that oligo with --oligo-len-2
+            instead of --oligo-len-1. Added code to accept the oligo length from
+            either parameter when a single sequence is specified.
+    Bugfix: The private allele summary count in populations.log could be
+            incorrect; values in populations.sumstats.tsv were not affected.
+    Bugfix: When running in parallel with paired-end reads and retaining discarded
+            reads, process_radtags could segfault. Corrected how threads write to
+            the discard files and updated the naming of the discard output files.
+    Bugfix: Corrected two small memory access errors in process_radtags.
+
Stacks 2.65 - August 18, 2023
-----------------------------
Feature: Added a "properly paired" reads counter to process_radtags.
=====================================
Makefile.am
=====================================
@@ -24,6 +24,7 @@ libcore_a_SOURCES = \
src/DNASeq.h src/DNASeq.cc \
src/FastaI.h \
src/FastqI.h \
+ src/FaidxI.h \
src/GappedAln.h \
src/gzFasta.h src/gzFasta.cc \
src/gzFastq.h \
=====================================
configure.ac
=====================================
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([Stacks], [2.65])
+AC_INIT([Stacks], [2.66])
AC_CONFIG_AUX_DIR([config])
AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
AC_CONFIG_SRCDIR([src/ustacks.cc])
=====================================
scripts/stacks-dist-extract
=====================================
@@ -1,42 +1,272 @@
-#!/usr/bin/env bash
-
-usage="\
-Usage:
- $(basename "$0") DIST_FILE
- $(basename "$0") [--pretty] DIST_FILE SECTION_NAME
-
-(1) Prints available sections/distributions for DIST_FILE.
-(2) Extracts section SECTION_NAME from DIST_FILE.
-"
-
-if { [[ $# -ne 1 ]] && [[ $# -ne 2 ]] && [[ $# -ne 3 ]]; } || [[ "$1" =~ ^(-h|--help|--version)$ ]] ;then
- echo -n "$usage" >&2
- exit 1
-fi
-
-if [[ "$1" =~ ^--pretty$ ]]; then
- dist_f="$2"
- sect="$3"
-else
- dist_f="$1"
- sect="$2"
-fi
-
-[[ -e "$dist_f" ]] || { ls -- "$dist_f"; exit 1; }
-
-if [[ $# -ge 2 ]] ;then
- section="$(echo "$sect" | grep -oE '\w+' | tr -d '\n')"
- if ! grep --quiet "^BEGIN $section\\b" "$dist_f" ;then
- echo "Error: Couldn't find section '$section' in '$dist_f'." >&1
- exit 1
- fi
-
- if [[ "$1" =~ ^--pretty$ ]]; then
- sed -nE "/^END $section($| )/ q; /^BEGIN $section($| )/,$ p" "$dist_f" | sed 1d | sed -nE "/^#/p"
- sed -nE "/^END $section($| )/ q; /^BEGIN $section($| )/,$ p" "$dist_f" | sed 1d | grep -v "^#" | column -s " " -t
- else
- sed -nE "/^END $section($| )/ q; /^BEGIN $section($| )/,$ p" "$dist_f" | sed 1d
- fi
-else
- grep ^BEGIN "$dist_f" | cut -c7-
-fi
+#!/usr/bin/env python3
+#
+# Copyright 2021-2023, Julian Catchen <jcatchen at illinois.edu>
+#
+# This file is part of Stacks.
+#
+# Stacks is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Stacks is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import argparse
+import os, sys
+import gzip
+
+#
+# Global configuration variables.
+#
+in_path = ""
+out_path = ""
+section = ""
+pretty = False
+
+def parse_command_line():
+ global in_path
+ global out_path
+ global section
+ global pretty
+
+ usage = '''
+ %(prog)s logfile [section]
+ %(prog)s [--pretty] [--out-path path] logfile [section]
+ cat logfile | %(prog)s [--pretty] [--section section]'''
+
+    desc = '''Export a particular section of a Stacks log or distribs file. If you \
+ supply a log path alone, %(prog)s will print the available sections to output. \
+ The log file can also be supplied via stdin.'''
+
+ p = argparse.ArgumentParser(description=desc, usage=usage)
+
+ #
+ # Add options.
+ #
+ p.add_argument("-p", "--pretty", action="store_true",
+ help="Output data as a table with columns lined up.")
+ p.add_argument("-o", "--out-path", type=str, metavar='path',
+ help="Path to output file.")
+ p.add_argument("-s", "--section", type=str, metavar='section',
+ help="Name of section to output from the log file.")
+
+ #
+ # Parse the command line
+ #
+ args, posargs = p.parse_known_args()
+
+ if args.pretty != None:
+ pretty = args.pretty
+ if args.section != None:
+ section = args.section
+ if args.out_path != None:
+ out_path = args.out_path
+
+ if len(posargs) > 0:
+ in_path = posargs[0]
+ if len(posargs) > 1 and args.section == None:
+ section = posargs[1]
+
+ #
+ # Test input file path.
+ #
+ if len(in_path) > 0 and os.path.exists(in_path) == False:
+ print("Input log file path does not exist ('{}')".format(in_path), file=sys.stderr)
+ p.print_help()
+ sys.exit()
+
+
+def scan_log_file(fh):
+ sections = []
+
+ for line in fh:
+ if line[0:6] == "BEGIN ":
+ section = line[6:]
+ section = section.strip("\n")
+ sections.append(section)
+ fh.seek(0)
+
+ return sections
+
+
+def find_section(sections, section):
+ section_hash = {}
+ for s in sections:
+ section_hash[s] = 0
+
+ if section in section_hash:
+ return section
+
+ slen = len(section)
+
+ for s in section_hash:
+ if s[0:slen] == section:
+ section_hash[s] += 1
+
+ sorted_sections = sorted(section_hash.items(), key=lambda x:x[1], reverse=True)
+
+ s = ""
+ if sorted_sections[0][1] == 0:
+ print("Section '{}' was not found.".format(section), file=sys.stderr)
+
+ elif sorted_sections[0][1] == 1 and sorted_sections[1][1] == 0:
+ s = sorted_sections[0][0]
+
+ elif sorted_sections[0][1] > 0 and sorted_sections[1][1] > 0:
+ print("Section '{}' is ambiguous and could refer to more than one section of the log:".format(section), file=sys.stderr)
+ for i in range(len(sorted_sections)):
+ if sorted_sections[i][1] > 0:
+ print(" " + sorted_sections[i][0], file=sys.stderr)
+ else:
+ break
+
+ return s
+
+
+def pretty_print(stream):
+ colcnts = []
+ for line in stream:
+ cols = line.split('\t')
+
+ while len(colcnts) < len(cols):
+ colcnts.append(0)
+
+ for i in range(len(cols)):
+ if cols[i][0] != "#" and colcnts[i] < len(cols[i]):
+ colcnts[i] = len(cols[i])
+
+ s = ""
+ for line in stream:
+ if line[0] == '#':
+ s += line + "\n"
+ continue
+
+ cols = line.split('\t')
+ for i in range(len(cols)):
+ s += str(cols[i]).ljust(colcnts[i]) + " "
+ s += "\n"
+
+ return s
+
+
+def output_section(in_fh, out_fh, sections, section):
+
+ start_output = False
+ s = []
+
+ for line in in_fh:
+ line = line.strip("\n")
+
+ if start_output == False and line == "BEGIN " + section:
+ start_output = True
+ continue
+
+ if start_output == True:
+ if line != "END " + section:
+ s.append(line)
+ else:
+ if pretty == True:
+ s = pretty_print(s)
+ out_fh.write(s)
+ else:
+ out_fh.write("\n".join(s) + "\n")
+ return
+
+
+def stream_section(in_fh, out_fh, section):
+
+ start_output = False
+ s = []
+
+ for line in in_fh:
+ line = line.strip("\n")
+
+ if len(section) == 0 and line[0:6] == "BEGIN ":
+ print(line[6:])
+ continue
+
+ if start_output == False and line == "BEGIN " + section:
+ start_output = True
+ continue
+
+ if start_output == True:
+ if line != "END " + section:
+ s.append(line)
+ else:
+ if pretty == True:
+ s = pretty_print(s)
+ out_fh.write(s)
+ else:
+ out_fh.write("\n".join(s) + "\n")
+ return
+
+ if len(section) > 0 and start_output == False:
+ print("Section '{}' was not found.".format(section), file=sys.stderr)
+ return
+
+
+def main():
+ parse_command_line()
+
+ #
+    # Open the input log file; if no file path is given, assume stdin.
+ #
+ in_fh = None
+ out_fh = None
+
+ if len(in_path) > 0:
+ if in_path[-3:] == ".gz":
+            in_fh = gzip.open(in_path, 'rt')
+ else:
+ in_fh = open(in_path)
+ else:
+ in_fh = sys.stdin
+
+ #
+ # Set output file handle
+ #
+ if len(out_path) > 0:
+        out_fh = open(out_path, 'w')
+ else:
+ out_fh = sys.stdout
+
+ if in_fh == sys.stdin:
+ stream_section(in_fh, out_fh, section)
+ else:
+ #
+ # Scan the log file for section headings
+ #
+ sections = scan_log_file(in_fh)
+
+ if len(sections) == 0:
+ print("No printable sections in file, '{}'.".format(in_path))
+ return
+
+ if len(section) == 0:
+ for s in sections:
+ out_fh.write(s + "\n")
+ return
+
+ #
+ # If a section was supplied, but it is ambiguous, try to match it.
+ #
+ found_section = find_section(sections, section)
+
+ if len(found_section) == 0:
+ return
+
+ output_section(in_fh, out_fh, sections, found_section)
+
+
+# #
+#------------------------------------------------------------------------------#
+# #
+if __name__ == "__main__":
+ main()
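
The partial-section-name support mentioned in the ChangeLog is implemented by find_section() above: an exact name wins, otherwise a unique prefix match is accepted and an ambiguous prefix is reported. A minimal standalone Python sketch of that idea, with made-up section names, independent of the upstream script:

    import sys

    def match_section(sections, query):
        # Exact name wins; otherwise accept a unique prefix match.
        if query in sections:
            return query
        hits = [s for s in sections if s.startswith(query)]
        if len(hits) == 1:
            return hits[0]
        if not hits:
            print("Section '{}' was not found.".format(query), file=sys.stderr)
        else:
            print("Section '{}' is ambiguous and could refer to:".format(query), file=sys.stderr)
            for s in hits:
                print("  " + s, file=sys.stderr)
        return ""

    # Hypothetical section names, for illustration only.
    sections = ["per_barcode_raw_read_counts", "clone_dist", "clonality_dist"]
    print(match_section(sections, "per_"))   # unique prefix -> full name
    print(match_section(sections, "clon"))   # ambiguous -> ''
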
=====================================
scripts/stacks-integrate-alignments
=====================================
@@ -127,11 +127,11 @@ def parse_command_line():
#
# Add options.
#
- p.add_argument("-P","--in-path", type=str, dest="stacks_path", required=True,
+ p.add_argument("-P","--in-path", type=str, dest="stacks_path", required=True, metavar="path",
                    help="Path to a directory containing Stacks output files.")
- p.add_argument("-B","--bam-path", type=str, dest="bam_path", required=True,
+ p.add_argument("-B","--bam-path", type=str, dest="bam_path", required=True, metavar="path",
help="Path to a SAM or BAM file containing alignment of de novo catalog loci to a reference genome.")
- p.add_argument("-O","--out-path", type=str, dest="out_path", required=True,
+ p.add_argument("-O","--out-path", type=str, dest="out_path", required=True, metavar="path",
                    help="Path to write the integrated output files.")
p.add_argument("-q", "--min_mapq", type=int,
help="Minimum mapping quality as listed in the BAM file (default 20).")
=====================================
src/FaidxI.h
=====================================
@@ -0,0 +1,202 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2023, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef __FAIDXI_H__
+#define __FAIDXI_H__
+
+#include "input.h"
+
+class Faidx: public Input {
+ map<string, long> index;
+ string buf;
+
+public:
+ Faidx(const char *path) : Input(path) {
+ ifstream idx_fh;
+ string index_path = string(path) + ".fai";
+
+ //
+ // Open and parse the index file for the FASTA.
+ //
+ idx_fh.open(index_path, ifstream::in);
+
+ if (idx_fh.fail()) {
+ cerr << "Error opening FASTA index file: '" << index_path << "'\n";
+ throw exception();
+ }
+
+ vector<string> parts;
+ long line = 1;
+ do {
+ idx_fh.getline(this->line, max_len);
+
+ if (idx_fh.eof())
+ break;
+
+ parse_tsv(this->line, parts);
+ if (parts.size() != 5) {
+ cerr << "Unable to parse FASTA index file, error on line " << line << "\n";
+ throw exception();
+ }
+ this->index[parts[0]] = is_integer(parts[2].c_str());
+ line++;
+ } while (idx_fh.good());
+
+ cerr << "Parsed '" << index_path << "', found " << this->index.size() << " chromosomes.\n";
+ idx_fh.close();
+
+ //
+ // Open the input FASTA file.
+ //
+ if (this->fh.fail())
+ cerr << "Error opening input file '" << path << "'\n";
+ if (fh.peek() != '>') {
+ cerr << "Error: '" << path << "': not in fasta format (expected '>').\n";
+ throw exception();
+ }
+ };
+
+ Faidx(string path) : Faidx(path.c_str()) { };
+ ~Faidx() {};
+ long offset(string chr) { return this->index.count(chr) > 0 ? this->index[chr] : -1; }
+ Seq *next_seq();
+ int next_seq(Seq &);
+ int read_seq(string, Seq &);
+};
+
+inline
+Seq *Faidx::next_seq()
+{
+ return NULL;
+}
+
+inline
+int Faidx::read_seq(string name, Seq &s)
+{
+ this->buf.clear();
+
+ //
+ // Jump to the proper offset in the file.
+ //
+ long offs = this->index[name];
+ if (offs < 0) {
+ cerr << "Unable to find entry for '" + name + "' in FASTA index list.\n";
+ throw exception();
+ }
+ this->fh.seekg(offs, this->fh.beg);
+
+ //
+ // Read the sequence from the file -- keep reading lines until we reach the next
+ // record or the end of file.
+ //
+ this->fh.getline(this->line, max_len);
+
+ size_t len;
+ while (this->line[0] != '>' && this->fh.good()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ this->line[0] = '\0';
+ this->fh.getline(this->line, max_len);
+ }
+
+ if (this->fh.eof()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ }
+
+ s.reserve(buf.length(), false);
+ strcpy(s.seq, this->buf.c_str());
+
+ return 1;
+}
+
+inline
+int Faidx::next_seq(Seq &s)
+{
+ this->buf.clear();
+
+ //
+ // Check the contents of the line buffer. When we finish reading a FASTA record
+ // the buffer will either contain whitespace or the header of the next FASTA
+ // record.
+ //
+ while (this->line[0] != '>' && this->fh.good() ) {
+ this->fh.getline(this->line, max_len);
+ }
+
+ if (!this->fh.good()) {
+ return 0;
+ }
+
+ //
+    // Check if there is a carriage return in the buffer
+ //
+ uint len = strlen(this->line);
+ if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ //
+ // Check if the ID line of the FASTA file has a comment after the ID.
+ //
+ char* q = this->line + 1;
+ ++q;
+ while (*q != '\0' && *q != ' ' && *q != '\t')
+ ++q;
+ if (*q != '\0') {
+ // Comment present.
+ *q = '\0';
+ ++q;
+ s.comment.assign(q);
+ }
+ assert(s.id != NULL);
+ strcpy(s.id, this->line + 1);
+
+ //
+ // Read the sequence from the file -- keep reading lines until we reach the next
+ // record or the end of file.
+ //
+ this->fh.getline(this->line, max_len);
+
+ while (this->line[0] != '>' && this->fh.good()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ this->line[0] = '\0';
+ this->fh.getline(this->line, max_len);
+ }
+
+ if (this->fh.eof()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ }
+
+ s.reserve(buf.length(), false);
+ strcpy(s.seq, this->buf.c_str());
+
+ return 1;
+}
+
+#endif // __FAIDXI_H__
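
The .fai file parsed above follows the standard samtools faidx layout of five tab-separated columns (NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH); Faidx keeps only the name-to-offset map and then reads lines until the next '>' header. A rough Python equivalent of that lookup, with hypothetical file names, to illustrate the format rather than the C++ class itself:

    def load_fai(fasta_path):
        # Column 3 of the .fai is the byte offset of the first sequence line.
        index = {}
        with open(fasta_path + ".fai") as fh:
            for line_no, line in enumerate(fh, 1):
                parts = line.rstrip("\n").split("\t")
                if len(parts) != 5:
                    raise ValueError("malformed .fai line {}".format(line_no))
                index[parts[0]] = int(parts[2])
        return index

    def read_chromosome(fasta_path, index, name):
        # Seek to the indexed offset and concatenate lines until the next record.
        chunks = []
        with open(fasta_path) as fh:
            fh.seek(index[name])
            for line in fh:
                if line.startswith(">"):
                    break
                chunks.append(line.strip())
        return "".join(chunks)

    # Hypothetical usage:
    # idx = load_fai("genome.fa")
    # chr1_seq = read_chromosome("genome.fa", idx, "chr1")
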
=====================================
src/FastaI.h
=====================================
@@ -70,7 +70,7 @@ int Fasta::next_seq(Seq &s)
}
if (!this->fh.good()) {
- return false;
+ return 0;
}
//
@@ -120,7 +120,7 @@ int Fasta::next_seq(Seq &s)
s.reserve(buf.length(), false);
strcpy(s.seq, this->buf.c_str());
- return true;
+ return 1;
}
#endif // __FASTAI_H__
=====================================
src/PopSum.cc
=====================================
@@ -545,13 +545,13 @@ LocPopSum::tally_metapop(const CSLocus *cloc)
if (p_cnt == 1 && q_cnt > 1) {
for (uint j = 0; j < this->_pop_cnt; j++)
- if (s[j]->nucs[col].p_nuc() == mp->nucs[col].p_allele ||
- s[j]->nucs[col].q_nuc() == mp->nucs[col].p_allele)
+ if ((s[j]->nucs[col].p_nuc() != 0 && s[j]->nucs[col].p_nuc() == mp->nucs[col].p_allele) ||
+ (s[j]->nucs[col].q_nuc() != 0 && s[j]->nucs[col].q_nuc() == mp->nucs[col].p_allele))
variable_pop = j;
} else if (p_cnt > 1 && q_cnt == 1) {
for (uint j = 0; j < this->_pop_cnt; j++)
- if (s[j]->nucs[col].p_nuc() == mp->nucs[col].q_allele ||
- s[j]->nucs[col].q_nuc() == mp->nucs[col].q_allele)
+ if ((s[j]->nucs[col].p_nuc() != 0 && s[j]->nucs[col].p_nuc() == mp->nucs[col].q_allele) ||
+ (s[j]->nucs[col].q_nuc() != 0 && s[j]->nucs[col].q_nuc() == mp->nucs[col].q_allele))
variable_pop = j;
} else if (p_cnt == 1 && q_cnt == 1) {
//
=====================================
src/Vcf.cc
=====================================
@@ -34,6 +34,7 @@ const VcfMeta VcfMeta::predefs::format_GL ("FORMAT","<ID=GL,Number=G,Type=Float,
const VcfMeta VcfMeta::predefs::format_GQ ("FORMAT","<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
const VcfMeta VcfMeta::predefs::format_GT ("FORMAT","<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+const VcfMeta VcfMeta::predefs::format_MY ("FORMAT","<ID=MY,Number=1,Type=String,Description=\"Methylation Annotation\">");
const VcfMeta VcfMeta::predefs::info_loc_strand (
"INFO",
"<ID=loc_strand,Number=1,Type=Character,Description=\"Genomic strand the corresponding Stacks locus aligns on\">"
@@ -62,6 +63,7 @@ void VcfHeader::add_std_meta(const string& version) {
add_meta(VcfMeta::predefs::format_GL);
add_meta(VcfMeta::predefs::format_GQ);
add_meta(VcfMeta::predefs::format_GT);
+ add_meta(VcfMeta::predefs::format_MY);
}
void VcfHeader::add_std_meta_with_contigs(const vector<Contig> &contigs, const string& version) {
@@ -88,6 +90,8 @@ void VcfHeader::add_std_meta_with_contigs(const vector<Contig> &contigs, const s
add_meta(VcfMeta::predefs::format_GL);
add_meta(VcfMeta::predefs::format_GQ);
add_meta(VcfMeta::predefs::format_GT);
+ add_meta(VcfMeta::predefs::format_MY);
+
}
void VcfRecord::assign(const char* rec, size_t len, const VcfHeader& header) {
=====================================
src/Vcf.h
=====================================
@@ -53,6 +53,9 @@ public:
static const VcfMeta format_GT;
static const VcfMeta format_HQ;
+ // Custom for methylation annotations.
+ static const VcfMeta format_MY;
+
// Custom.
static const VcfMeta info_loc_strand;
};
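
For context, the new MY key is emitted per sample alongside DP (see the gstacks.cc changes below), using single-character codes for unmethylated CpG/CHG/CHH calls. A small, hypothetical illustration of how such a record fragment could look; the header string is the one registered above, while the depths and annotation codes are invented:

    header_meta = '##FORMAT=<ID=MY,Number=1,Type=String,Description="Methylation Annotation">'
    fmt = "DP:MY"
    samples = [("12", "z"), ("0", "."), ("7", "u")]   # invented depth/annotation pairs
    print(header_meta)
    print(fmt + "\t" + "\t".join(":".join(pair) for pair in samples))
    # DP:MY   12:z    0:.     7:u
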
=====================================
src/clean.cc
=====================================
@@ -466,11 +466,13 @@ check_quality_scores(RawRead *href, int qual_offset, int score_limit, int len_li
}
// cerr << "\n";
+ const int *p, *q;
double mean = 0.0;
double working_sum = 0.0;
- int *p, *q, j;
+ int j;
// cerr << "Window length: " << href->win_len << "; Stop position: " << href->stop_pos << "\n";
+
//
// Populate the sliding window.
//
@@ -487,7 +489,7 @@ check_quality_scores(RawRead *href, int qual_offset, int score_limit, int len_li
j = offset;
// cerr << "Setting pointers; P: " << (href->int_scores + offset) - href->int_scores << "; Q: " << p + (int) href->win_len - p << "; J: " << j << "\n";
-
+
do {
mean = working_sum / href->win_len;
=====================================
src/clean.h
=====================================
@@ -280,7 +280,7 @@ public:
this->win_len = 1;
this->len += this->inline_bc_len;
- this->stop_pos = this->len - this->win_len;
+ this->stop_pos = this->len - this->win_len - 1;
return 0;
}
=====================================
src/clone_filter.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -1279,6 +1279,8 @@ int parse_command_line(int argc, char* argv[]) {
switch(barcode_type) {
case null_index:
case null_inline:
+ if (oligo_len_1 == 0 && oligo_len_2 > 0)
+ oligo_len_1 = oligo_len_2;
case inline_null:
case index_null:
if (oligo_len_1 > 0 && oligo_len_2 == 0) {
=====================================
src/clone_filter.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
=====================================
src/file_io.cc
=====================================
@@ -934,12 +934,14 @@ build_file_list(vector<pair<string, string> > &files)
// Check the file suffix to make sure we should process it.
//
p = file.c_str();
- q = p + file.length() + 1;
+ q = p + file.length();
end = q;
while (q >= p && *q != '.') q--;
- if (strcmp(q, ".gz") == 0) {
+ if (strcmp(q, ".gz") == 0 && q > p) {
end = q;
- while (q >= p && *q != '.') q--;
+ do {
+ q--;
+ } while (q >= p && *q != '.');
}
if (strncmp(q, ".fq", end - q) != 0 &&
strncmp(q, ".fa", end - q) != 0 &&
@@ -969,6 +971,8 @@ build_file_list(vector<pair<string, string> > &files)
if (files.size() == 0) {
cerr << "Unable to locate any input files to process within '" << in_path_1 << "'\n";
}
+ closedir(dir);
+
} else {
//
// Files specified directly:
@@ -1002,3 +1006,31 @@ build_file_list(vector<pair<string, string> > &files)
return 0;
}
+
+int
+file_suffix_position(string filename)
+{
+ const char *p, *q, *end;
+
+ //
+ // Check the file suffix to make sure we should process it.
+ //
+ p = filename.c_str();
+ q = p + filename.length();
+ end = q;
+ while (q >= p && *q != '.') q--;
+ if (strcmp(q, ".gz") == 0 && q > p) {
+ end = q;
+ do {
+ q--;
+ } while (q >= p && *q != '.');
+ }
+ if (strncmp(q, ".fq", end - q) == 0 ||
+ strncmp(q, ".fa", end - q) == 0 ||
+ strncmp(q, ".fastq", end - q) == 0 ||
+ strncmp(q, ".fasta", end - q) == 0 ||
+ strncmp(q, ".bam", end - q) == 0)
+ return q - p;
+ else
+ return end - p;
+}
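
The new file_suffix_position() returns the index at which a recognized read-file suffix (.fq/.fa/.fastq/.fasta/.bam, optionally followed by .gz) begins, so callers can strip it when building output names. A simplified Python sketch of that idea; the example file names are hypothetical, and the fallback for unrecognized extensions is not an exact match of the C++ behaviour:

    import re

    def file_suffix_position(filename):
        # Index where the recognized suffix starts, or len(filename) if none is found.
        m = re.search(r"\.(fq|fa|fastq|fasta|bam)(\.gz)?$", filename)
        return m.start() if m else len(filename)

    # Hypothetical examples:
    # file_suffix_position("sample_01.fq.gz")  -> 9   (".fq.gz" stripped when naming discards)
    # file_suffix_position("sample_01.fastq")  -> 9
    # file_suffix_position("sample_01.txt")    -> 13  (unrecognized; name left as-is)
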
=====================================
src/file_io.h
=====================================
@@ -95,5 +95,6 @@ int open_files(vector<pair<string, string> > &,
map<string, map<string, long> > &);
int close_file_handles(map<BarcodePair, ofstream *> &);
int close_file_handles(map<BarcodePair, gzFile *> &);
+int file_suffix_position(string filename);
#endif // __FILE_IO_H__
=====================================
src/gstacks.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2017-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2017-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -57,6 +57,8 @@ bool bam_output = false;
bool detailed_output = false;
bool rm_unpaired_reads = false;
bool rm_pcr_duplicates = false;
+bool call_methyl_status = false;
+string reference_path;
modelt model_type = marukilow;
unique_ptr<const Model> model;
@@ -241,6 +243,13 @@ try {
o_hapgraphs_f << o_hapgraphs_header;
}
+ //
+ // If calling methylated bases, open the reference genome FASTA.
+ //
+ Faidx *reference_genome = NULL;
+ if (call_methyl_status)
+ reference_genome = new Faidx(reference_path);
+
//
// Store a list of contigs found in reference-aligned data.
//
@@ -280,7 +289,7 @@ try {
bool eof = false;
#pragma omp parallel num_threads(num_threads)
{ try {
- LocusProcessor loc_proc (bam_mpopi->samples().size(), bam_cloc_reader.get());
+ LocusProcessor loc_proc (bam_mpopi->samples().size(), bam_cloc_reader.get(), reference_genome);
Timers& t = loc_proc.timers();
CLocReadSet loc (*bam_mpopi); // For denovo.
@@ -1254,6 +1263,15 @@ LocusProcessor::process(CLocAlnSet& aln_loc)
}
timers_.genotyping.update();
+ //
+ // Call methylated bases, if requested.
+ //
+ vector<SiteMethylCounts> methylcalls;
+ if (call_methyl_status) {
+ methylcalls.reserve(aln_loc.ref().length());
+ call_methylated_status(aln_loc, calls, methylcalls);
+ }
+
//
// Call haplotypes; amend SNP/genotype calls.
//
@@ -1290,7 +1308,7 @@ LocusProcessor::process(CLocAlnSet& aln_loc)
//
// Create the outputs.
//
- write_one_locus(aln_loc, depths, calls, phase_data);
+ write_one_locus(aln_loc, depths, calls, phase_data, methylcalls);
if (detailed_output) {
loc_.details_ss << "END locus " << loc_.id << "\n";
@@ -2409,11 +2427,219 @@ bool PhaseSet::is_edge_consistent(size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j
return !crossed_edge;
}
+void LocusProcessor::call_methylated_status (
+ const CLocAlnSet& aln_loc,
+ const vector<SiteCall>& calls,
+ vector<SiteMethylCounts>& methylcalls
+) {
+ const PhyLoc loc = aln_loc.pos();
+ //
+ // Check if we have the reference sequence for this locus yet.
+ //
+ if (this->ref_chrs_.count(loc.chr()) == 0) {
+ #pragma omp critical
+ {
+ if (this->ref_chrs_.count(loc.chr()) == 0) {
+ Seq seq;
+ this->ref_genome_->read_seq(loc.chr(), seq);
+ this->ref_chrs_[loc.chr()] = string(seq.seq);
+ }
+ }
+ }
+
+ char *ref = new char[aln_loc.ref().length() + 1];
+ const char *r = this->ref_chrs_[loc.chr()].c_str() + loc.bp;
+ cerr << "Length: " << aln_loc.ref().length() << "\n";
+ //
+ // Pull a copy of the locus from the reference genome.
+ //
+ if (loc.strand == strand_type::strand_plus) {
+ strncpy(ref, r, aln_loc.ref().length());
+ } else {
+ // If the locus is on the negative strand
+ r = r - (aln_loc.ref().length() - 1);
+ strncpy(ref, r, aln_loc.ref().length());
+ rev_comp_inplace(ref);
+ }
+
+ DNASeq4 consensus (aln_loc.ref().length());
+ for (size_t i = 0; i < aln_loc.ref().length(); ++i) {
+ if (!calls[i].alleles().empty()) {
+ consensus.set(i, calls[i].most_frequent_allele());
+ }
+ // // } else {
+ // // pair<size_t,Nt2> nt = depths[i].tot.sorted()[0];
+ // // if (nt.first > 0)
+ // // consensus.set(i, nt.second);
+ // // }
+ }
+
+ //
+ // Adjust the reference genome sequence to match the assembled contig.
+ //
+ GappedAln aligner;
+ AlignRes aln_res;
+ aligner.init(aln_loc.ref().length(), aln_loc.ref().length());
+ aligner.align(string(ref), consensus.str());
+ string cigar_str = invert_cigar(aligner.result().cigar);
+ cerr << "CIGAR: " << cigar_str << "\n";
+ cerr << "REF: " << string(ref) << "\n";
+ cerr << "LOCUS: " << consensus.str() << "\n";
+
+ // cerr << "RAD locus is at " << loc.chr() << "; "
+ // << (loc.strand == strand_type::strand_plus ? loc.bp : loc.bp - aln_loc.ref().length() - 1) << ": "
+ // << string(ref, aln_loc.ref().length()) << "\n"
+ // << "Length: " << aln_loc.ref().length() << "\n";;
+
+ size_t tot_samples = aln_loc.mpopi().samples().size();
+ //
+ // Iterate over each SNP call and compare the bases with the imported reference genome sequence.
+ //
+ size_t end = calls.size() - 1;
+ for (size_t i = 0; i < calls.size(); i++) {
+
+ const vector<SampleCall> sample_calls = calls[i].sample_calls();
+
+ // Initialize the array of calls at this site.
+ methylcalls.push_back(SiteMethylCounts());
+ for (size_t j = 0; j < tot_samples; j++)
+ methylcalls[i].samples.push_back(methylt::irrelevant);
+
+ //
+ // No useful data at this site.
+ //
+ if (calls[i].alleles().empty())
+ continue;
+
+ if (ref[i] != 'G')
+ continue;
+
+ //
+ // Call unmethylated status of site along with nearby context:
+ // CG: 'C' followed directly by a 'G'; also referred to as CpG or mCG
+ // CHG: 'C' followed by an H: 'A', 'T', or 'C', followed by a 'G'
+ // CHH: 'C' followed by H ('A', 'T', or 'C'), followed by H ('A', 'T', or 'C')
+ //
+ // With EM-Seq pulling from the negative strand:
+ // We will see the 'C' as a 'G', converted by EM-seq to an 'A'
+ //
+
+ if (i < end - 2) {
+ // Are there two additional sites left in the locus, so we can call CHH and CHG?
+ vector<bool> site_1(tot_samples, false);
+ vector<bool> site_2(tot_samples, false);
+ vector<bool> site_3(tot_samples, false);
+
+ sites_equal_nucleotide(calls[i], 'A', site_1);
+ sites_equal_nucleotide(calls[i + 1], 'C', site_2);
+ sites_equal_nucleotide(calls[i + 2], 'C', site_3);
+
+ for (size_t j = 0; j < tot_samples; j++) {
+ if (site_1[j] == true && site_2[j] == false && site_3[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_chg;
+ else if (site_1[j] == true && site_2[j] == false && site_3[j] == false)
+ methylcalls[i].samples[j] = methylt::unmethyl_chh;
+ else
+ methylcalls[i].samples[j] = methylt::unmethyl_cpg;
+ }
+
+ } else if (i < end - 1) {
+ //
+ // Is there one additional site left in the locus, so we can call CpG?
+ // If so:
+            //     Test for unmethylated CpG: a C nucleotide followed directly by a G
+ //
+ vector<bool> site_1(tot_samples, false);
+ vector<bool> site_2(tot_samples, false);
+
+ sites_equal_nucleotide(calls[i], 'A', site_1);
+ sites_equal_nucleotide(calls[i + 1], 'C', site_2);
+
+ for (size_t j = 0; j < tot_samples; j++) {
+ if (site_1[j] == true && site_2[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_cpg;
+ else if (site_1[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_unk;
+ }
+ } else {
+ //
+            // This is the last site in the locus, or it is surrounded by uncalled nucleotides ('N'),
+            // so we cannot determine additional methylation context.
+ //
+ vector<bool> site_1(tot_samples, false);
+
+ sites_equal_nucleotide(calls[i], 'A', site_1);
+
+ for (size_t j = 0; j < tot_samples; j++)
+ if (site_1[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_unk;
+ }
+ }
+
+ // for (size_t i = 0; i < methylcalls.size(); i++) {
+ // cerr << "Column: " << i << "\n";
+ // cerr << " Ref[" << i << "]: " << ref[i] << "; called: " << calls[i].most_frequent_allele() << "\n";
+ // cerr << " Sample calls:\n";
+ // for (size_t j = 0; j < tot_samples; j++) {
+ // if (methylcalls[i].samples[j] == methylt::unmethyl_unk)
+    //             cerr << "    Sample[" << j << "]: unmethylated unknown\n";
+    //         else if (methylcalls[i].samples[j] == methylt::unmethyl_cpg)
+    //             cerr << "    Sample[" << j << "]: unmethylated CpG\n";
+    //         else if (methylcalls[i].samples[j] == methylt::unmethyl_chg)
+    //             cerr << "    Sample[" << j << "]: unmethylated CHG\n";
+    //         else if (methylcalls[i].samples[j] == methylt::unmethyl_chh)
+    //             cerr << "    Sample[" << j << "]: unmethylated CHH\n";
+ // }
+ // }
+
+ delete [] ref;
+
+ return;
+}
+
+inline
+void LocusProcessor::sites_equal_nucleotide(const SiteCall &call, char nuc, vector<bool> &site_vals)
+{
+ if (call.alleles().size() == 1) {
+ // Fixed site
+ if (call.most_frequent_allele() == nuc)
+ for (size_t i = 0; i < site_vals.size(); i++)
+ site_vals[i] = true;
+
+ } else {
+ const vector<SampleCall> &sample_calls = call.sample_calls();
+
+ // Variable site
+ map<Nt2,double>::const_iterator a_1 = call.alleles().begin();
+ map<Nt2,double>::const_iterator a_2 = a_1++;
+ if (a_1->first != nuc && a_2->first != nuc)
+ return;
+
+ for (size_t i = 0; i < sample_calls.size(); i++) {
+ switch(sample_calls[i].call()) {
+ case snp_type_hom:
+ if (sample_calls[i].nt0() == nuc)
+ site_vals[i] = true;
+ break;
+ case snp_type_het:
+ if (sample_calls[i].nt0() == nuc || sample_calls[i].nt1() == nuc)
+ site_vals[i] = true;
+ break;
+ case snp_type_unk:
+ case snp_type_discarded:
+ default:
+ break;
+ }
+ }
+ }
+}
+
void LocusProcessor::write_one_locus (
const CLocAlnSet& aln_loc,
const vector<SiteCounts>& depths,
const vector<SiteCall>& calls,
- const vector<map<size_t,PhasedHet>>& phase_data
+ const vector<map<size_t,PhasedHet>>& phase_data,
+ const vector<SiteMethylCounts>& methylcalls
) {
char loc_id[16];
sprintf(loc_id, "%d", loc_.id);
@@ -2431,8 +2657,10 @@ void LocusProcessor::write_one_locus (
stringstream vcf_records;
VcfRecord rec;
for (size_t i=0; i<ref.length(); ++i) {
- const SiteCounts& sitedepths = depths[i];
- const SiteCall& sitecall = calls[i];
+ const SiteCounts& sitedepths = depths[i];
+ const SiteCall& sitecall = calls[i];
+ const SiteMethylCounts& methylcall = methylcalls[i];
+
if (sitecall.alleles().empty())
// No useful data at this site.
continue;
@@ -2503,15 +2731,48 @@ void LocusProcessor::write_one_locus (
}
// Format.
rec.append_format("DP");
- // Genotypes.
- for (size_t sample=0; sample<mpopi.samples().size(); ++sample) {
+ if (call_methyl_status)
+ rec.append_format("MY");
+
+ for (size_t sample = 0; sample < mpopi.samples().size(); ++sample) {
+ stringstream genotype;
+
+ // Genotypes.
size_t dp = sitedepths.samples[sample].sum();
- if (dp == 0) {
- rec.append_sample(".");
- continue;
+ if (dp > 0) {
+ genotype << dp;
+ ++sample_sites_w_data[sample];
+ } else {
+ genotype << ".";
}
- ++sample_sites_w_data[sample];
- rec.append_sample(to_string(dp));
+
+ if (call_methyl_status) {
+ // Methylation calls
+ genotype << ":";
+
+ if (dp == 0)
+ genotype << ".";
+ else
+ switch(methylcall.samples[sample]) {
+ case methylt::unmethyl_cpg:
+ genotype << "z";
+ break;
+ case methylt::unmethyl_chg:
+ genotype << "x";
+ break;
+ case methylt::unmethyl_chh:
+ genotype << "h";
+ break;
+ case methylt::unmethyl_unk:
+ genotype << "u";
+ break;
+ case methylt::irrelevant:
+ default:
+ genotype << ".";
+ break;
+ }
+ }
+ rec.append_sample(genotype.str());
}
} else {
@@ -2548,6 +2809,8 @@ void LocusProcessor::write_one_locus (
rec.append_format("DP");
rec.append_format("AD");
rec.append_format("GL");
+ if (call_methyl_status)
+ rec.append_format("MY");
// Genotypes.
for (size_t sample : mpopi.sample_indexes_orig_order()) {
@@ -2616,6 +2879,31 @@ void LocusProcessor::write_one_locus (
genotype << ":";
join(sdepths.arr(), ',', genotype);
}
+
+ if (call_methyl_status) {
+ // Methylation calls
+ genotype << ":";
+
+ switch(methylcall.samples[sample]) {
+ case methylt::unmethyl_cpg:
+ genotype << "z";
+ break;
+ case methylt::unmethyl_chg:
+ genotype << "x";
+ break;
+ case methylt::unmethyl_chh:
+ genotype << "h";
+ break;
+ case methylt::unmethyl_unk:
+ genotype << "u";
+ break;
+ case methylt::irrelevant:
+ default:
+ genotype << ".";
+ break;
+ }
+ }
+
// Push it.
rec.append_sample(genotype.str());
}
@@ -2834,6 +3122,7 @@ const string help_string = string() +
" --write-alignments: save read alignments (heavy BAM files)\n"
"\n"
" (Reference-based mode)\n"
+    "  --methyl-status: assumes a methylation-aware molecular protocol was used; calls methylated bases.\n"
" --min-mapq: minimum PHRED-scaled mapping quality to consider a read (default: 10)\n"
" --max-clipped: maximum soft-clipping level, in fraction of read length (default: 0.20)\n"
" --max-insert-len: maximum allowed sequencing insert length (default: 1000)\n"
@@ -2896,6 +3185,8 @@ try {
{"max-insert-len", required_argument, NULL, 1016},
{"rm-unpaired-reads", no_argument, NULL, 1017},
{"rm-pcr-duplicates", no_argument, NULL, 1018},
+ {"methyl-status", no_argument, NULL, 1025},
+ {"reference", required_argument, NULL, 1026},
{"phasing-cooccurrences-thr-range", required_argument, NULL, 1019},
{"phasing-dont-prune-hets", no_argument, NULL, 1020},
//debug options
@@ -2995,11 +3286,11 @@ try {
bad_args();
}
break;
- case 1012: //ignore-pe-reads
+ case 1012: // ignore-pe-reads
ignore_pe_reads = true;
refbased_cfg.ign_pe_reads = true;
break;
- case 1001://kmer-length
+ case 1001: // kmer-length
km_length = atoi(optarg);
if (km_length < 2 || km_length > 31) {
cerr << "Error: Illegal -t option value '" << optarg << "' (valid range is 2-31).\n";
@@ -3011,53 +3302,53 @@ try {
tmp_long = std::stol(optarg);
if (tmp_long < 1)
throw exception();
- } catch(std::exception) {
+ } catch(std::exception &e) {
cerr << "Error: Illegal max-debruijn-reads option value '" << optarg << "'.\n";
bad_args();
}
max_debruijn_reads = tmp_long;
break;
- case 1002://min-kmer-cov
+ case 1002: // min-kmer-cov
min_km_count = atoi(optarg);
break;
- case 1021://write-alignments
+ case 1021: // write-alignments
bam_output = true;
break;
- case 1013://details
+ case 1013: // details
detailed_output = true;
break;
- case 1014://min-mapq
+ case 1014: // min-mapq
refbased_cfg.min_mapq = stoi(optarg);
if (refbased_cfg.min_mapq > 255) {
cerr << "Error: Illegal --min-mapq value '" << optarg << "'.\n";
bad_args();
}
break;
- case 1015://max-clipped
+ case 1015: // max-clipped
refbased_cfg.max_clipped = atof(optarg);
if (refbased_cfg.max_clipped < 0.0 || refbased_cfg.max_clipped > 1.0) {
cerr << "Error: Illegal --max-clipped value '" << optarg << "'.\n";
bad_args();
}
break;
- case 1016://max-insert
+ case 1016: // max-insert
refbased_cfg.max_insert_refsize = stoi(optarg);
break;
- case 1017://rm-unpaired-reads
+ case 1017: // rm-unpaired-reads
rm_unpaired_reads = true;
break;
- case 1018://rm-pcr-duplicates
+ case 1018: // rm-pcr-duplicates
rm_pcr_duplicates = true;
rm_unpaired_reads = true;
// no_pcr_duplicates_measures();
break;
- // case 1022://pcr-duplicates-measures
+ // case 1022: // pcr-duplicates-measures
// pcr_duplicates_measures();
// break;
- // case 1023://no-pcr-duplicates-measures
+ // case 1023: // no-pcr-duplicates-measures
// no_pcr_duplicates_measures();
// break;
- case 1019: //phasing-cooccurrences-thr-range
+ case 1019: // phasing-cooccurrences-thr-range
std::regex_search(optarg, tmp_cmatch, std::regex("^[0-9]+,[0-9]+$"));
if (!tmp_cmatch.empty()) {
phasing_cooccurrences_thr_range.first = atoi(optarg);
@@ -3069,10 +3360,15 @@ try {
bad_args();
}
break;
- case 1020: //phasing-dont-prune-hets
+ case 1020: // phasing-dont-prune-hets
phasing_dont_prune_hets = true;
break;
-
+ case 1025: // methyl-status
+ call_methyl_status = true;
+ break;
+ case 1026: // reference
+ reference_path = optarg;
+ break;
//
// Debug options
//
@@ -3259,6 +3555,11 @@ try {
if (in_bams.empty())
DOES_NOT_HAPPEN;
+ if (call_methyl_status && reference_path.length() == 0) {
+ cerr << "You must specify a reference genome in FASTA format when calling methylated bases.\n";
+ bad_args();
+ }
+
//
// Write the report.
//
@@ -3302,6 +3603,8 @@ try {
os << " Removing PCR duplicates.\n";
// if (pcr_dupl_measures)
// os << " PCR duplicates mitigation measures enabled.\n";
+ if (call_methyl_status)
+        os << "  Calling methylated bases using reference: '" << reference_path << "'\n";
return os.str();
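
The CpG/CHG/CHH definitions in the comments of call_methylated_status(), and the z/x/h/u/. codes written to the MY field, boil down to classifying the two bases that follow a cytosine. The sketch below illustrates those definitions on the forward strand only, with invented sequences; the gstacks code itself works on the G-to-A converted representation seen in EM-seq data, so this is an illustration of the terminology, not of the implementation:

    def cytosine_context(ref, i):
        # Context of a 'C' at position i: CpG, CHG, CHH, or unknown near the locus edge.
        if i >= len(ref) or ref[i] != "C":
            return None
        h = set("ACT")
        if i + 1 < len(ref) and ref[i + 1] == "G":
            return "CpG"
        if i + 2 < len(ref) and ref[i + 1] in h and ref[i + 2] == "G":
            return "CHG"
        if i + 2 < len(ref) and ref[i + 1] in h and ref[i + 2] in h:
            return "CHH"
        return "unknown"

    # Codes used for unmethylated calls in the MY FORMAT field above.
    MY_CODE = {"CpG": "z", "CHG": "x", "CHH": "h", "unknown": "u", None: "."}

    for pos, seq in [(1, "ACGTT"), (1, "ACTGA"), (1, "ACTTA"), (3, "AAAC")]:
        ctx = cytosine_context(seq, pos)
        print(seq, pos, ctx, MY_CODE[ctx])
    # ACGTT 1 CpG z / ACTGA 1 CHG x / ACTTA 1 CHH h / AAAC 3 unknown u
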
=====================================
src/gstacks.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2017, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2017-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -26,6 +26,7 @@
#include "SuffixTree.h"
#include "GappedAln.h"
#include "BamI.h"
+#include "FaidxI.h"
#include "locus_readers.h"
string parse_command_line(int argc, char* argv[]);
@@ -234,8 +235,8 @@ struct Timers {
//
class LocusProcessor {
public:
- LocusProcessor(size_t n_samples, const BamCLocReader* reader)
- : gt_stats_(n_samples), hap_stats_(n_samples), ctg_stats_(), loc_(), bam_cloc_reader_(reader) {}
+ LocusProcessor(size_t n_samples, const BamCLocReader* reader, Faidx *ref_genome)
+ : gt_stats_(n_samples), hap_stats_(n_samples), ctg_stats_(), loc_(), bam_cloc_reader_(reader), ref_genome_(ref_genome) {}
// Process a locus.
void process(CLocReadSet& loc);
@@ -260,6 +261,9 @@ private:
mutable LocData loc_;
const BamCLocReader* bam_cloc_reader_;
+ Faidx *ref_genome_;
+ map<string, string> ref_chrs_;
+
DNASeq4 assemble_locus_contig(
const vector<SRead>& fw_reads,
const vector<SRead>& pe_reads);
@@ -329,12 +333,26 @@ private:
const size_t min_n_cooccurrences
) const;
+ //
+ // Call methylated status of each base.
+ //
+ void call_methylated_status(
+ const CLocAlnSet& aln_loc,
+ const vector<SiteCall>& calls,
+ vector<SiteMethylCounts>& methylcalls);
+
+ void sites_equal_nucleotide(
+ const SiteCall &call,
+ char nuc,
+ vector<bool> &site_vals);
+
// Create the fasta/vcf text outputs.
void write_one_locus (
const CLocAlnSet& aln_loc,
const vector<SiteCounts>& depths,
const vector<SiteCall>& calls,
- const vector<map<size_t,PhasedHet>>& phase_data // {col : phasedhet} maps, for all samples
+ const vector<map<size_t,PhasedHet>>& phase_data, // {col : phasedhet} maps, for all samples
+ const vector<SiteMethylCounts>& methylcalls
);
// (debug) Write a sample's haplotype graph.
=====================================
src/populations.cc
=====================================
@@ -2769,10 +2769,10 @@ SumStatsSummary::accumulate(const vector<LocBin *> &loci)
//
// Compile private alleles
//
- if (t->nucs[pos].priv_allele >= 0)
+ if (t->nucs[pos].priv_allele >= 0) {
_private_cnt[t->nucs[pos].priv_allele]++;
-
- else if (t->nucs[pos].priv_allele == two_priv_alleles) {
+
+ } else if (t->nucs[pos].priv_allele == two_priv_alleles) {
// If there are two private alleles at this position, find the two populations they occurred in.
for (uint pop = 0; pop < this->_pop_cnt; pop++) {
s = loci[i]->s->per_pop(pop);
=====================================
src/process_radtags.cc
=====================================
@@ -52,6 +52,7 @@ barcodet barcode_type = null_null;
bool retain_header = false;
bool filter_adapter = false;
bool bestrad = false;
+bool methylrad = false;
bool paired = false;
bool clean = false;
bool quality = false;
@@ -109,6 +110,28 @@ int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
+ //
+    // If the user specifies a methylated restriction enzyme, make sure we have its sequence.
+ //
+ if (check_radtag && methylrad) {
+ initialize_renz_methylconv(renz);
+
+ if (renz.count(renz_1) == 0) {
+ cerr << "Unrecognized methylated restriction enzyme specified: '" << renz_1.c_str() << "'.\n";
+ help();
+ }
+
+ if (renz_2.length() > 0 && renz.count(renz_2) == 0) {
+ cerr << "Unrecognized methylated restriction enzyme specified: '" << renz_2.c_str() << "'.\n";
+ help();
+ }
+
+ cerr << "Converting restriction enzyme sequence for '" << renz_1 << "'";
+ if (renz_2.length() > 0)
+ cerr << " and '" << renz_2 << "'";
+ cerr << " to methylated version for cut site remnant checking.\n";
+ }
+
//
// If input files are gzipped, output gziped files, unless the user chooses an output type.
//
@@ -415,7 +438,11 @@ process_paired_reads(string prefix_1,
// Open a file for recording discarded reads
//
if (discards) {
- path_1 = out_path + prefix_1 + ".discards";
+ string dprefix_1 = prefix_1.substr(0, file_suffix_position(prefix_1));
+ string dprefix_2 = prefix_2.substr(0, file_suffix_position(prefix_2));
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
+
+ path_1 = out_path + dprefix_1 + ".discards" + dsuffix;
discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
if (discard_fh_1->fail()) {
@@ -423,7 +450,7 @@ process_paired_reads(string prefix_1,
exit(1);
}
- path_2 = out_path + prefix_2 + ".discards";
+ path_2 = out_path + dprefix_2 + ".discards" + dsuffix;
discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
if (discard_fh_2->fail()) {
@@ -638,7 +665,7 @@ process_paired_reads_parallel(string prefix_1,
map<BarcodePair, map<string, long> > &barcode_log) {
Input *fh_1=NULL, *fh_2=NULL;
RawRead *r_1=NULL, *r_2=NULL;
- ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
+ vector<ofstream *> discard_fhs_1(num_output_threads), discard_fhs_2(num_output_threads);
int return_val = 1;
@@ -668,20 +695,29 @@ process_paired_reads_parallel(string prefix_1,
// Open a file for recording discarded reads
//
if (discards) {
- path_1 = out_path + prefix_1 + ".discards";
- discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
-
- if (discard_fh_1->fail()) {
- cerr << "Error opening single-end discard output file '" << path_1 << "'\n";
- exit(1);
- }
+ string dprefix_1 = prefix_1.substr(0, file_suffix_position(prefix_1));
+ string dprefix_2 = prefix_2.substr(0, file_suffix_position(prefix_2));
+
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
+
+ for (uint i = 0; i < discard_fhs_1.size(); i++) {
+ string thread_str = discard_fhs_1.size() > 1 ? "." + to_string(i+1) : "";
+
+ path_1 = out_path + dprefix_1 + thread_str + ".discards" + dsuffix;
+ discard_fhs_1[i] = new ofstream(path_1.c_str(), ifstream::out);
+
+ if (discard_fhs_1[i]->fail()) {
+ cerr << "Error opening single-end discard output file '" << path_1 << "'\n";
+ exit(1);
+ }
- path_2 = out_path + prefix_2 + ".discards";
- discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
+ path_2 = out_path + dprefix_2 + thread_str + ".discards" + dsuffix;
+ discard_fhs_2[i] = new ofstream(path_2.c_str(), ifstream::out);
- if (discard_fh_2->fail()) {
- cerr << "Error opening paired-end discard output file '" << path_2 << "'\n";
- exit(1);
+ if (discard_fhs_2[i]->fail()) {
+ cerr << "Error opening paired-end discard output file '" << path_2 << "'\n";
+ exit(1);
+ }
}
}
@@ -890,12 +926,12 @@ process_paired_reads_parallel(string prefix_1,
if (discards && !r_1->retain)
result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_1, s_1) :
- write_fasta(discard_fh_1, s_1);
+ write_fastq(discard_fhs_1[thread_no - 1], s_1) :
+ write_fasta(discard_fhs_1[thread_no - 1], s_1);
if (discards && !r_2->retain)
result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_2, s_2) :
- write_fasta(discard_fh_2, s_2);
+ write_fastq(discard_fhs_2[thread_no - 1], s_2) :
+ write_fasta(discard_fhs_2[thread_no - 1], s_2);
if (!result_1 || !result_2) {
cerr << "Error writing to discard file for '" << bc->str() << "'\n";
@@ -1060,8 +1096,10 @@ process_paired_reads_parallel(string prefix_1,
consolidate_counters(counter, local_counters);
if (discards) {
- delete discard_fh_1;
- delete discard_fh_2;
+ for (uint i = 0; i < discard_fhs_1.size(); i++) {
+ delete discard_fhs_1[i];
+ delete discard_fhs_2[i];
+ }
}
delete fh_1;
@@ -1101,7 +1139,10 @@ process_reads(string prefix,
// Open a file for recording discarded reads
//
if (discards) {
- path = out_path + prefix + ".discards";
+ string dprefix = prefix.substr(0, file_suffix_position(prefix));
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
+
+ path = out_path + dprefix + ".discards" + dsuffix;
discard_fh = new ofstream(path.c_str(), ifstream::out);
if (discard_fh->fail()) {
@@ -1242,8 +1283,8 @@ process_reads_parallel(string prefix,
map<string, long> &counter,
map<BarcodePair, map<string, long>> &barcode_log)
{
- Input *fh = NULL;
- ofstream *discard_fh = NULL;
+ Input *fh = NULL;
+ vector<ofstream *> discard_fhs(num_output_threads);
int return_val = 1;
@@ -1262,12 +1303,18 @@ process_reads_parallel(string prefix,
// Open a file for recording discarded reads
//
if (discards) {
- path = out_path + prefix + ".discards";
- discard_fh = new ofstream(path.c_str(), ifstream::out);
+ string dprefix = prefix.substr(0, file_suffix_position(prefix));
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
- if (discard_fh->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
+ for (uint i = 0; i < discard_fhs.size(); i++) {
+            string thread_str = discard_fhs.size() > 1 ? "." + to_string(i+1) : "";
+            path = out_path + dprefix + thread_str + ".discards" + dsuffix;
+ discard_fhs[i] = new ofstream(path.c_str(), ifstream::out);
+
+ if (discard_fhs[i]->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
}
}
@@ -1439,10 +1486,9 @@ process_reads_parallel(string prefix,
}
if (discards && !r->retain) {
- #pragma omp critical
result = out_file_type == FileT::fastq || out_file_type == FileT::gzfastq ?
- write_fastq(discard_fh, s) :
- write_fasta(discard_fh, s);
+ write_fastq(discard_fhs[thread_no - 1], s) :
+ write_fasta(discard_fhs[thread_no - 1], s);
}
if (!result) {
@@ -1572,8 +1618,12 @@ process_reads_parallel(string prefix,
consolidate_barcode_logs(barcode_log, local_barcode_log);
consolidate_counters(counter, local_counters);
- if (discards) delete discard_fh;
-
+ if (discards) {
+ for (uint i = 0; i < discard_fhs.size(); i++) {
+ delete discard_fhs[i];
+ }
+ }
+
//
// Close the file and delete the Input object.
//
@@ -2092,9 +2142,10 @@ int parse_command_line(int argc, char* argv[]) {
{"interleaved", no_argument, NULL, 'I'},
{"merge", no_argument, NULL, 'm'},
{"disable-rad-check", no_argument, NULL, 'R'}, {"disable_rad_check", no_argument, NULL, 'R'},
- {"filter-illumina", no_argument, NULL, 'F'}, {"filter_illumina", no_argument, NULL, 'F'},
- {"retain-header", no_argument, NULL, 'H'}, {"retain_header", no_argument, NULL, 'H'},
+ {"filter-illumina", no_argument, NULL, 'F'},
+ {"retain-header", no_argument, NULL, 'H'},
{"bestrad", no_argument, NULL, 1000},
+ {"methylrad", no_argument, NULL, 1002},
{"null-index", no_argument, NULL, 'U'}, {"null_index", no_argument, NULL, 'U'},
{"index-null", no_argument, NULL, 'u'}, {"index_null", no_argument, NULL, 'u'},
{"inline-null", no_argument, NULL, 'V'}, {"inline_null", no_argument, NULL, 'V'},
@@ -2196,6 +2247,9 @@ int parse_command_line(int argc, char* argv[]) {
case 1001:
num_threads = is_integer(optarg);
break;
+ case 1002:
+ methylrad = true;
+ break;
case 'B':
barcode_dist_1 = is_integer(optarg);
break;
@@ -2471,7 +2525,8 @@ void help() {
cerr << "\n"
<< " Protocol-specific options:\n"
- << " --bestrad: library was generated using BestRAD, check for restriction enzyme on either read and potentially tranpose reads.\n\n"
+        << "  --bestrad: library was generated using BestRAD, check for restriction enzyme on either read and potentially transpose reads.\n"
+ << " --methylrad: library was generated using enzymatic methyl-seq (EM-seq) or bisulphite sequencing.\n\n"
<< " Adapter options:\n"
<< " --adapter-1 <sequence>: provide adaptor sequence that may occur on the single-end read for filtering.\n"
<< " --adapter-2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
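
With the parallel discard fix above, each output thread now gets its own discard file, named from the input prefix with its read-file suffix stripped, a per-thread index when more than one thread writes, and an explicit .fq/.fa extension. A small Python sketch of that naming scheme, with invented inputs:

    def discard_paths(out_path, stripped_prefix, fastq_output, n_output_threads):
        # Mirrors the path construction in process_paired_reads_parallel() above.
        suffix = ".fq" if fastq_output else ".fa"
        paths = []
        for i in range(n_output_threads):
            thread_str = "." + str(i + 1) if n_output_threads > 1 else ""
            paths.append(out_path + stripped_prefix + thread_str + ".discards" + suffix)
        return paths

    # Hypothetical example:
    # discard_paths("./out/", "sample_01", True, 2)
    #   -> ['./out/sample_01.1.discards.fq', './out/sample_01.2.discards.fq']
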
=====================================
src/renz.cc
=====================================
@@ -142,9 +142,18 @@ const char *xbaI[] = {"CTAGA", // T/CTAGA, XbaI
const char *xhoI[] = {"TCGAG", // C/TCGAG, XhoI
"CTCGA"};
-void
-initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len) {
+//
+// Methylation-converted restriction enzyme sequences: Gs -> As
+//
+const char *pstI_methyl[] = {"TACAA", // CTGCA/G, PstI
+ "TTGTA"};
+const char *sbfI_methyl[] = {"TACAAA", // CCTGCA/GG, SbfI
+ "TTTGTA"};
+
+void
+initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len)
+{
renz["sbfI"] = sbfI; // CCTGCA/GG, SbfI
renz["pstI"] = pstI; // CTGCA/G, PstI
renz["pstIshi"] = pstIshi; // CTGCA/G, PstI-SHI
@@ -333,7 +342,17 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
}
void
-initialize_renz_olap(map<string, int> &renz_olap) {
+initialize_renz_methylconv(map<string, const char **> &renz)
+{
+ renz.clear();
+
+ renz["sbfI"] = sbfI_methyl; // CCTGCA/GG, SbfI
+ renz["pstI"] = pstI_methyl; // CTGCA/G, PstI
+}
+
+void
+initialize_renz_olap(map<string, int> &renz_olap)
+{
renz_olap["sbfI"] = 4;
}
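
The *_methyl tables follow the "Gs -> As" rule in the comment above and, as in the xhoI entry earlier in this file, the second entry is the reverse complement of the first. A short Python check of that relationship; the unconverted pstI/sbfI remnants are inferred from the cut-site comments (CTGCA/G, CCTGCA/GG) and are not shown in this hunk:

    def methyl_convert(remnant):
        # EM-seq/bisulphite conversion as modelled here: every G becomes an A.
        return remnant.replace("G", "A")

    def rev_comp(seq):
        comp = {"A": "T", "C": "G", "G": "C", "T": "A"}
        return "".join(comp[nt] for nt in reversed(seq))

    for name, remnant in [("pstI", "TGCAG"), ("sbfI", "TGCAGG")]:
        converted = methyl_convert(remnant)
        print(name, converted, rev_comp(converted))
    # pstI TACAA TTGTA    (matches pstI_methyl)
    # sbfI TACAAA TTTGTA  (matches sbfI_methyl)
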
=====================================
src/renz.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2016, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -28,5 +28,6 @@
void initialize_renz_olap(map<string, int> &renz_olap);
void initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len);
+void initialize_renz_methylconv(map<string, const char **> &renz);
#endif // __RENZ_H__
=====================================
src/stacks.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -43,10 +43,11 @@
typedef unsigned int uint;
typedef string allele_type;
-enum snp_type {snp_type_het, snp_type_hom, snp_type_unk, snp_type_discarded};
-enum read_type {primary, secondary};
-enum searcht {sequence, genomic_loc};
-enum corr_type {p_value, bonferroni_win, bonferroni_gen, no_correction};
+enum snp_type {snp_type_het, snp_type_hom, snp_type_unk, snp_type_discarded};
+enum read_type {primary, secondary};
+enum searcht {sequence, genomic_loc};
+enum corr_type {p_value, bonferroni_win, bonferroni_gen, no_correction};
+enum methylt {unmethyl_cpg, unmethyl_chg, unmethyl_chh, unmethyl_unk, irrelevant};
class SNP {
public:
@@ -275,9 +276,14 @@ struct Read {
};
struct SiteCounts {
- Counts<Nt2> tot; // The sum over all samples.
- vector<Counts<Nt2>> samples; // With size() == mpopi->samples().size()/
- const MetaPopInfo* mpopi;
+ Counts<Nt2> tot; // The sum over all samples.
+ vector<Counts<Nt2>> samples; // With size() == mpopi->samples().size()
+ const MetaPopInfo *mpopi;
+};
+
+struct SiteMethylCounts {
+ vector<methylt> samples; // With size() == mpopi->samples().size()
+ const MetaPopInfo *mpopi;
};
//
View it on GitLab: https://salsa.debian.org/med-team/stacks/-/commit/60c4eb63eeee0774189faa71b77c8b2a3574c1b0