[med-svn] [Git][med-team/stacks][upstream] New upstream version 2.66+dfsg
Lance Lin (@linqigang)
gitlab at salsa.debian.org
Fri Jan 19 16:12:24 GMT 2024
Lance Lin pushed to branch upstream at Debian Med / stacks
Commits:
60c4eb63 by Lance Lin at 2024-01-19T21:03:50+07:00
New upstream version 2.66+dfsg
- - - - -
23 changed files:
- ChangeLog
- Makefile.am
- configure.ac
- scripts/stacks-dist-extract
- scripts/stacks-integrate-alignments
- + src/FaidxI.h
- src/FastaI.h
- src/PopSum.cc
- src/Vcf.cc
- src/Vcf.h
- src/clean.cc
- src/clean.h
- src/clone_filter.cc
- src/clone_filter.h
- src/file_io.cc
- src/file_io.h
- src/gstacks.cc
- src/gstacks.h
- src/populations.cc
- src/process_radtags.cc
- src/renz.cc
- src/renz.h
- src/stacks.h
Changes:
=====================================
ChangeLog
=====================================
@@ -1,3 +1,21 @@
+Stacks 2.66 - December 5, 2023
+------------------------------
+    Feature: Rewrote stacks-dist-extract in Python, including support for partial
+             section names, streaming from standard input, and other improvements.
+    Feature: Included a new stacks-private-alleles script that extracts private
+             allele data from the populations program outputs, produces useful
+             summaries, and prepares the data for plotting.
+    Bugfix: In clone_filter, users sometimes specified a single oligo sequence on
+            the paired read but gave the length of that oligo with --oligo-len-2
+            instead of --oligo-len-1. Added code to accept the oligo length from
+            either parameter when a single sequence is specified.
+    Bugfix: The private allele summary count in populations.log could be
+            incorrect; values in populations.sumstats.tsv were not affected.
+    Bugfix: When running in parallel with paired-end reads and retaining discarded
+            reads, process_radtags could segfault. Corrected how threads write to
+            the discard files and updated the naming of the discard output files.
+    Bugfix: Corrected two small memory access errors in process_radtags.
+
Stacks 2.65 - August 18, 2023
-----------------------------
Feature: Added a "properly paired" reads counter to process_radtags.
=====================================
Makefile.am
=====================================
@@ -24,6 +24,7 @@ libcore_a_SOURCES = \
src/DNASeq.h src/DNASeq.cc \
src/FastaI.h \
src/FastqI.h \
+ src/FaidxI.h \
src/GappedAln.h \
src/gzFasta.h src/gzFasta.cc \
src/gzFastq.h \
=====================================
configure.ac
=====================================
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([Stacks], [2.65])
+AC_INIT([Stacks], [2.66])
AC_CONFIG_AUX_DIR([config])
AM_INIT_AUTOMAKE([-Wall -Werror foreign parallel-tests subdir-objects])
AC_CONFIG_SRCDIR([src/ustacks.cc])
=====================================
scripts/stacks-dist-extract
=====================================
@@ -1,42 +1,272 @@
-#!/usr/bin/env bash
-
-usage="\
-Usage:
- $(basename "$0") DIST_FILE
- $(basename "$0") [--pretty] DIST_FILE SECTION_NAME
-
-(1) Prints available sections/distributions for DIST_FILE.
-(2) Extracts section SECTION_NAME from DIST_FILE.
-"
-
-if { [[ $# -ne 1 ]] && [[ $# -ne 2 ]] && [[ $# -ne 3 ]]; } || [[ "$1" =~ ^(-h|--help|--version)$ ]] ;then
- echo -n "$usage" >&2
- exit 1
-fi
-
-if [[ "$1" =~ ^--pretty$ ]]; then
- dist_f="$2"
- sect="$3"
-else
- dist_f="$1"
- sect="$2"
-fi
-
-[[ -e "$dist_f" ]] || { ls -- "$dist_f"; exit 1; }
-
-if [[ $# -ge 2 ]] ;then
- section="$(echo "$sect" | grep -oE '\w+' | tr -d '\n')"
- if ! grep --quiet "^BEGIN $section\\b" "$dist_f" ;then
- echo "Error: Couldn't find section '$section' in '$dist_f'." >&1
- exit 1
- fi
-
- if [[ "$1" =~ ^--pretty$ ]]; then
- sed -nE "/^END $section($| )/ q; /^BEGIN $section($| )/,$ p" "$dist_f" | sed 1d | sed -nE "/^#/p"
- sed -nE "/^END $section($| )/ q; /^BEGIN $section($| )/,$ p" "$dist_f" | sed 1d | grep -v "^#" | column -s " " -t
- else
- sed -nE "/^END $section($| )/ q; /^BEGIN $section($| )/,$ p" "$dist_f" | sed 1d
- fi
-else
- grep ^BEGIN "$dist_f" | cut -c7-
-fi
+#!/usr/bin/env python3
+#
+# Copyright 2021-2023, Julian Catchen <jcatchen at illinois.edu>
+#
+# This file is part of Stacks.
+#
+# Stacks is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Stacks is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import argparse
+import os, sys
+import gzip
+
+#
+# Global configuration variables.
+#
+in_path = ""
+out_path = ""
+section = ""
+pretty = False
+
+def parse_command_line():
+ global in_path
+ global out_path
+ global section
+ global pretty
+
+ usage = '''
+ %(prog)s logfile [section]
+ %(prog)s [--pretty] [--out-path path] logfile [section]
+ cat logfile | %(prog)s [--pretty] [--section section]'''
+
+    desc = '''Export a particular section of a Stacks log or distribs file. If you \
+ supply a log path alone, %(prog)s will print the available sections to output. \
+ The log file can also be supplied via stdin.'''
+
+ p = argparse.ArgumentParser(description=desc, usage=usage)
+
+ #
+ # Add options.
+ #
+ p.add_argument("-p", "--pretty", action="store_true",
+ help="Output data as a table with columns lined up.")
+ p.add_argument("-o", "--out-path", type=str, metavar='path',
+ help="Path to output file.")
+ p.add_argument("-s", "--section", type=str, metavar='section',
+ help="Name of section to output from the log file.")
+
+ #
+ # Parse the command line
+ #
+ args, posargs = p.parse_known_args()
+
+ if args.pretty != None:
+ pretty = args.pretty
+ if args.section != None:
+ section = args.section
+ if args.out_path != None:
+ out_path = args.out_path
+
+ if len(posargs) > 0:
+ in_path = posargs[0]
+ if len(posargs) > 1 and args.section == None:
+ section = posargs[1]
+
+ #
+ # Test input file path.
+ #
+ if len(in_path) > 0 and os.path.exists(in_path) == False:
+ print("Input log file path does not exist ('{}')".format(in_path), file=sys.stderr)
+ p.print_help()
+ sys.exit()
+
+
+def scan_log_file(fh):
+ sections = []
+
+ for line in fh:
+ if line[0:6] == "BEGIN ":
+ section = line[6:]
+ section = section.strip("\n")
+ sections.append(section)
+ fh.seek(0)
+
+ return sections
+
+
+def find_section(sections, section):
+ section_hash = {}
+ for s in sections:
+ section_hash[s] = 0
+
+ if section in section_hash:
+ return section
+
+ slen = len(section)
+
+ for s in section_hash:
+ if s[0:slen] == section:
+ section_hash[s] += 1
+
+ sorted_sections = sorted(section_hash.items(), key=lambda x:x[1], reverse=True)
+
+ s = ""
+ if sorted_sections[0][1] == 0:
+ print("Section '{}' was not found.".format(section), file=sys.stderr)
+
+ elif sorted_sections[0][1] == 1 and sorted_sections[1][1] == 0:
+ s = sorted_sections[0][0]
+
+ elif sorted_sections[0][1] > 0 and sorted_sections[1][1] > 0:
+ print("Section '{}' is ambiguous and could refer to more than one section of the log:".format(section), file=sys.stderr)
+ for i in range(len(sorted_sections)):
+ if sorted_sections[i][1] > 0:
+ print(" " + sorted_sections[i][0], file=sys.stderr)
+ else:
+ break
+
+ return s
+
+
+def pretty_print(stream):
+ colcnts = []
+ for line in stream:
+ cols = line.split('\t')
+
+ while len(colcnts) < len(cols):
+ colcnts.append(0)
+
+ for i in range(len(cols)):
+ if cols[i][0] != "#" and colcnts[i] < len(cols[i]):
+ colcnts[i] = len(cols[i])
+
+ s = ""
+ for line in stream:
+ if line[0] == '#':
+ s += line + "\n"
+ continue
+
+ cols = line.split('\t')
+ for i in range(len(cols)):
+ s += str(cols[i]).ljust(colcnts[i]) + " "
+ s += "\n"
+
+ return s
+
+
+def output_section(in_fh, out_fh, sections, section):
+
+ start_output = False
+ s = []
+
+ for line in in_fh:
+ line = line.strip("\n")
+
+ if start_output == False and line == "BEGIN " + section:
+ start_output = True
+ continue
+
+ if start_output == True:
+ if line != "END " + section:
+ s.append(line)
+ else:
+ if pretty == True:
+ s = pretty_print(s)
+ out_fh.write(s)
+ else:
+ out_fh.write("\n".join(s) + "\n")
+ return
+
+
+def stream_section(in_fh, out_fh, section):
+
+ start_output = False
+ s = []
+
+ for line in in_fh:
+ line = line.strip("\n")
+
+ if len(section) == 0 and line[0:6] == "BEGIN ":
+ print(line[6:])
+ continue
+
+ if start_output == False and line == "BEGIN " + section:
+ start_output = True
+ continue
+
+ if start_output == True:
+ if line != "END " + section:
+ s.append(line)
+ else:
+ if pretty == True:
+ s = pretty_print(s)
+ out_fh.write(s)
+ else:
+ out_fh.write("\n".join(s) + "\n")
+ return
+
+ if len(section) > 0 and start_output == False:
+ print("Section '{}' was not found.".format(section), file=sys.stderr)
+ return
+
+
+def main():
+ parse_command_line()
+
+ #
+    # Open the input log file; if no file path is given, assume stdin.
+ #
+ in_fh = None
+ out_fh = None
+
+ if len(in_path) > 0:
+ if in_path[-3:] == ".gz":
+            in_fh = gzip.open(in_path, 'rt')
+ else:
+ in_fh = open(in_path)
+ else:
+ in_fh = sys.stdin
+
+ #
+ # Set output file handle
+ #
+ if len(out_path) > 0:
+        out_fh = open(out_path, 'w')
+ else:
+ out_fh = sys.stdout
+
+ if in_fh == sys.stdin:
+ stream_section(in_fh, out_fh, section)
+ else:
+ #
+ # Scan the log file for section headings
+ #
+ sections = scan_log_file(in_fh)
+
+ if len(sections) == 0:
+ print("No printable sections in file, '{}'.".format(in_path))
+ return
+
+ if len(section) == 0:
+ for s in sections:
+ out_fh.write(s + "\n")
+ return
+
+ #
+ # If a section was supplied, but it is ambiguous, try to match it.
+ #
+ found_section = find_section(sections, section)
+
+ if len(found_section) == 0:
+ return
+
+ output_section(in_fh, out_fh, sections, found_section)
+
+
+# #
+#------------------------------------------------------------------------------#
+# #
+if __name__ == "__main__":
+ main()
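
The partial-section-name support mentioned in the ChangeLog is implemented by find_section() above: an exact name wins, otherwise a unique prefix match is accepted and an ambiguous prefix is reported. A minimal standalone Python sketch of that idea, with made-up section names, independent of the upstream script:

    import sys

    def match_section(sections, query):
        # Exact name wins; otherwise accept a unique prefix match.
        if query in sections:
            return query
        hits = [s for s in sections if s.startswith(query)]
        if len(hits) == 1:
            return hits[0]
        if not hits:
            print("Section '{}' was not found.".format(query), file=sys.stderr)
        else:
            print("Section '{}' is ambiguous and could refer to:".format(query), file=sys.stderr)
            for s in hits:
                print("  " + s, file=sys.stderr)
        return ""

    # Hypothetical section names, for illustration only.
    sections = ["per_barcode_raw_read_counts", "clone_dist", "clonality_dist"]
    print(match_section(sections, "per_"))   # unique prefix -> full name
    print(match_section(sections, "clon"))   # ambiguous -> ''
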
=====================================
scripts/stacks-integrate-alignments
=====================================
@@ -127,11 +127,11 @@ def parse_command_line():
#
# Add options.
#
- p.add_argument("-P","--in-path", type=str, dest="stacks_path", required=True,
+ p.add_argument("-P","--in-path", type=str, dest="stacks_path", required=True, metavar="path",
                    help="Path to a directory containing Stacks output files.")
- p.add_argument("-B","--bam-path", type=str, dest="bam_path", required=True,
+ p.add_argument("-B","--bam-path", type=str, dest="bam_path", required=True, metavar="path",
help="Path to a SAM or BAM file containing alignment of de novo catalog loci to a reference genome.")
- p.add_argument("-O","--out-path", type=str, dest="out_path", required=True,
+ p.add_argument("-O","--out-path", type=str, dest="out_path", required=True, metavar="path",
                    help="Path to write the integrated output files.")
p.add_argument("-q", "--min_mapq", type=int,
help="Minimum mapping quality as listed in the BAM file (default 20).")
=====================================
src/FaidxI.h
=====================================
@@ -0,0 +1,202 @@
+// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
+//
+// Copyright 2023, Julian Catchen <jcatchen at illinois.edu>
+//
+// This file is part of Stacks.
+//
+// Stacks is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Stacks is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Stacks. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef __FAIDXI_H__
+#define __FAIDXI_H__
+
+#include "input.h"
+
+class Faidx: public Input {
+ map<string, long> index;
+ string buf;
+
+public:
+ Faidx(const char *path) : Input(path) {
+ ifstream idx_fh;
+ string index_path = string(path) + ".fai";
+
+ //
+ // Open and parse the index file for the FASTA.
+ //
+ idx_fh.open(index_path, ifstream::in);
+
+ if (idx_fh.fail()) {
+ cerr << "Error opening FASTA index file: '" << index_path << "'\n";
+ throw exception();
+ }
+
+ vector<string> parts;
+ long line = 1;
+ do {
+ idx_fh.getline(this->line, max_len);
+
+ if (idx_fh.eof())
+ break;
+
+ parse_tsv(this->line, parts);
+ if (parts.size() != 5) {
+ cerr << "Unable to parse FASTA index file, error on line " << line << "\n";
+ throw exception();
+ }
+ this->index[parts[0]] = is_integer(parts[2].c_str());
+ line++;
+ } while (idx_fh.good());
+
+ cerr << "Parsed '" << index_path << "', found " << this->index.size() << " chromosomes.\n";
+ idx_fh.close();
+
+ //
+ // Open the input FASTA file.
+ //
+ if (this->fh.fail())
+ cerr << "Error opening input file '" << path << "'\n";
+ if (fh.peek() != '>') {
+ cerr << "Error: '" << path << "': not in fasta format (expected '>').\n";
+ throw exception();
+ }
+ };
+
+ Faidx(string path) : Faidx(path.c_str()) { };
+ ~Faidx() {};
+ long offset(string chr) { return this->index.count(chr) > 0 ? this->index[chr] : -1; }
+ Seq *next_seq();
+ int next_seq(Seq &);
+ int read_seq(string, Seq &);
+};
+
+inline
+Seq *Faidx::next_seq()
+{
+ return NULL;
+}
+
+inline
+int Faidx::read_seq(string name, Seq &s)
+{
+ this->buf.clear();
+
+ //
+ // Jump to the proper offset in the file.
+ //
+ long offs = this->index[name];
+ if (offs < 0) {
+ cerr << "Unable to find entry for '" + name + "' in FASTA index list.\n";
+ throw exception();
+ }
+ this->fh.seekg(offs, this->fh.beg);
+
+ //
+ // Read the sequence from the file -- keep reading lines until we reach the next
+ // record or the end of file.
+ //
+ this->fh.getline(this->line, max_len);
+
+ size_t len;
+ while (this->line[0] != '>' && this->fh.good()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ this->line[0] = '\0';
+ this->fh.getline(this->line, max_len);
+ }
+
+ if (this->fh.eof()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ }
+
+ s.reserve(buf.length(), false);
+ strcpy(s.seq, this->buf.c_str());
+
+ return 1;
+}
+
+inline
+int Faidx::next_seq(Seq &s)
+{
+ this->buf.clear();
+
+ //
+ // Check the contents of the line buffer. When we finish reading a FASTA record
+ // the buffer will either contain whitespace or the header of the next FASTA
+ // record.
+ //
+ while (this->line[0] != '>' && this->fh.good() ) {
+ this->fh.getline(this->line, max_len);
+ }
+
+ if (!this->fh.good()) {
+ return 0;
+ }
+
+ //
+    // Check if there is a carriage return in the buffer
+ //
+ uint len = strlen(this->line);
+ if (this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ //
+ // Check if the ID line of the FASTA file has a comment after the ID.
+ //
+ char* q = this->line + 1;
+ ++q;
+ while (*q != '\0' && *q != ' ' && *q != '\t')
+ ++q;
+ if (*q != '\0') {
+ // Comment present.
+ *q = '\0';
+ ++q;
+ s.comment.assign(q);
+ }
+ assert(s.id != NULL);
+ strcpy(s.id, this->line + 1);
+
+ //
+ // Read the sequence from the file -- keep reading lines until we reach the next
+ // record or the end of file.
+ //
+ this->fh.getline(this->line, max_len);
+
+ while (this->line[0] != '>' && this->fh.good()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ this->line[0] = '\0';
+ this->fh.getline(this->line, max_len);
+ }
+
+ if (this->fh.eof()) {
+ len = strlen(this->line);
+ if (len > 0 && this->line[len - 1] == '\r') this->line[len - 1] = '\0';
+
+ this->buf += this->line;
+ }
+
+ s.reserve(buf.length(), false);
+ strcpy(s.seq, this->buf.c_str());
+
+ return 1;
+}
+
+#endif // __FAIDXI_H__
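
The .fai file parsed above follows the standard samtools faidx layout of five tab-separated columns (NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH); Faidx keeps only the name-to-offset map and then reads lines until the next '>' header. A rough Python equivalent of that lookup, with hypothetical file names, to illustrate the format rather than the C++ class itself:

    def load_fai(fasta_path):
        # Column 3 of the .fai is the byte offset of the first sequence line.
        index = {}
        with open(fasta_path + ".fai") as fh:
            for line_no, line in enumerate(fh, 1):
                parts = line.rstrip("\n").split("\t")
                if len(parts) != 5:
                    raise ValueError("malformed .fai line {}".format(line_no))
                index[parts[0]] = int(parts[2])
        return index

    def read_chromosome(fasta_path, index, name):
        # Seek to the indexed offset and concatenate lines until the next record.
        chunks = []
        with open(fasta_path) as fh:
            fh.seek(index[name])
            for line in fh:
                if line.startswith(">"):
                    break
                chunks.append(line.strip())
        return "".join(chunks)

    # Hypothetical usage:
    # idx = load_fai("genome.fa")
    # chr1_seq = read_chromosome("genome.fa", idx, "chr1")
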
=====================================
src/FastaI.h
=====================================
@@ -70,7 +70,7 @@ int Fasta::next_seq(Seq &s)
}
if (!this->fh.good()) {
- return false;
+ return 0;
}
//
@@ -120,7 +120,7 @@ int Fasta::next_seq(Seq &s)
s.reserve(buf.length(), false);
strcpy(s.seq, this->buf.c_str());
- return true;
+ return 1;
}
#endif // __FASTAI_H__
=====================================
src/PopSum.cc
=====================================
@@ -545,13 +545,13 @@ LocPopSum::tally_metapop(const CSLocus *cloc)
if (p_cnt == 1 && q_cnt > 1) {
for (uint j = 0; j < this->_pop_cnt; j++)
- if (s[j]->nucs[col].p_nuc() == mp->nucs[col].p_allele ||
- s[j]->nucs[col].q_nuc() == mp->nucs[col].p_allele)
+ if ((s[j]->nucs[col].p_nuc() != 0 && s[j]->nucs[col].p_nuc() == mp->nucs[col].p_allele) ||
+ (s[j]->nucs[col].q_nuc() != 0 && s[j]->nucs[col].q_nuc() == mp->nucs[col].p_allele))
variable_pop = j;
} else if (p_cnt > 1 && q_cnt == 1) {
for (uint j = 0; j < this->_pop_cnt; j++)
- if (s[j]->nucs[col].p_nuc() == mp->nucs[col].q_allele ||
- s[j]->nucs[col].q_nuc() == mp->nucs[col].q_allele)
+ if ((s[j]->nucs[col].p_nuc() != 0 && s[j]->nucs[col].p_nuc() == mp->nucs[col].q_allele) ||
+ (s[j]->nucs[col].q_nuc() != 0 && s[j]->nucs[col].q_nuc() == mp->nucs[col].q_allele))
variable_pop = j;
} else if (p_cnt == 1 && q_cnt == 1) {
//
=====================================
src/Vcf.cc
=====================================
@@ -34,6 +34,7 @@ const VcfMeta VcfMeta::predefs::format_GL ("FORMAT","<ID=GL,Number=G,Type=Float,
const VcfMeta VcfMeta::predefs::format_GQ ("FORMAT","<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
const VcfMeta VcfMeta::predefs::format_GT ("FORMAT","<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
+const VcfMeta VcfMeta::predefs::format_MY ("FORMAT","<ID=MY,Number=1,Type=String,Description=\"Methylation Annotation\">");
const VcfMeta VcfMeta::predefs::info_loc_strand (
"INFO",
"<ID=loc_strand,Number=1,Type=Character,Description=\"Genomic strand the corresponding Stacks locus aligns on\">"
@@ -62,6 +63,7 @@ void VcfHeader::add_std_meta(const string& version) {
add_meta(VcfMeta::predefs::format_GL);
add_meta(VcfMeta::predefs::format_GQ);
add_meta(VcfMeta::predefs::format_GT);
+ add_meta(VcfMeta::predefs::format_MY);
}
void VcfHeader::add_std_meta_with_contigs(const vector<Contig> &contigs, const string& version) {
@@ -88,6 +90,8 @@ void VcfHeader::add_std_meta_with_contigs(const vector<Contig> &contigs, const s
add_meta(VcfMeta::predefs::format_GL);
add_meta(VcfMeta::predefs::format_GQ);
add_meta(VcfMeta::predefs::format_GT);
+ add_meta(VcfMeta::predefs::format_MY);
+
}
void VcfRecord::assign(const char* rec, size_t len, const VcfHeader& header) {
=====================================
src/Vcf.h
=====================================
@@ -53,6 +53,9 @@ public:
static const VcfMeta format_GT;
static const VcfMeta format_HQ;
+ // Custom for methylation annotations.
+ static const VcfMeta format_MY;
+
// Custom.
static const VcfMeta info_loc_strand;
};
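
For context, the new MY key is emitted per sample alongside DP (see the gstacks.cc changes below), using single-character codes for unmethylated CpG/CHG/CHH calls. A small, hypothetical illustration of how such a record fragment could look; the header string is the one registered above, while the depths and annotation codes are invented:

    header_meta = '##FORMAT=<ID=MY,Number=1,Type=String,Description="Methylation Annotation">'
    fmt = "DP:MY"
    samples = [("12", "z"), ("0", "."), ("7", "u")]   # invented depth/annotation pairs
    print(header_meta)
    print(fmt + "\t" + "\t".join(":".join(pair) for pair in samples))
    # DP:MY   12:z    0:.     7:u
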
=====================================
src/clean.cc
=====================================
@@ -466,11 +466,13 @@ check_quality_scores(RawRead *href, int qual_offset, int score_limit, int len_li
}
// cerr << "\n";
+ const int *p, *q;
double mean = 0.0;
double working_sum = 0.0;
- int *p, *q, j;
+ int j;
// cerr << "Window length: " << href->win_len << "; Stop position: " << href->stop_pos << "\n";
+
//
// Populate the sliding window.
//
@@ -487,7 +489,7 @@ check_quality_scores(RawRead *href, int qual_offset, int score_limit, int len_li
j = offset;
// cerr << "Setting pointers; P: " << (href->int_scores + offset) - href->int_scores << "; Q: " << p + (int) href->win_len - p << "; J: " << j << "\n";
-
+
do {
mean = working_sum / href->win_len;
=====================================
src/clean.h
=====================================
@@ -280,7 +280,7 @@ public:
this->win_len = 1;
this->len += this->inline_bc_len;
- this->stop_pos = this->len - this->win_len;
+ this->stop_pos = this->len - this->win_len - 1;
return 0;
}
=====================================
src/clone_filter.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -1279,6 +1279,8 @@ int parse_command_line(int argc, char* argv[]) {
switch(barcode_type) {
case null_index:
case null_inline:
+ if (oligo_len_1 == 0 && oligo_len_2 > 0)
+ oligo_len_1 = oligo_len_2;
case inline_null:
case index_null:
if (oligo_len_1 > 0 && oligo_len_2 == 0) {
=====================================
src/clone_filter.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2015, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
=====================================
src/file_io.cc
=====================================
@@ -934,12 +934,14 @@ build_file_list(vector<pair<string, string> > &files)
// Check the file suffix to make sure we should process it.
//
p = file.c_str();
- q = p + file.length() + 1;
+ q = p + file.length();
end = q;
while (q >= p && *q != '.') q--;
- if (strcmp(q, ".gz") == 0) {
+ if (strcmp(q, ".gz") == 0 && q > p) {
end = q;
- while (q >= p && *q != '.') q--;
+ do {
+ q--;
+ } while (q >= p && *q != '.');
}
if (strncmp(q, ".fq", end - q) != 0 &&
strncmp(q, ".fa", end - q) != 0 &&
@@ -969,6 +971,8 @@ build_file_list(vector<pair<string, string> > &files)
if (files.size() == 0) {
cerr << "Unable to locate any input files to process within '" << in_path_1 << "'\n";
}
+ closedir(dir);
+
} else {
//
// Files specified directly:
@@ -1002,3 +1006,31 @@ build_file_list(vector<pair<string, string> > &files)
return 0;
}
+
+int
+file_suffix_position(string filename)
+{
+ const char *p, *q, *end;
+
+ //
+ // Check the file suffix to make sure we should process it.
+ //
+ p = filename.c_str();
+ q = p + filename.length();
+ end = q;
+ while (q >= p && *q != '.') q--;
+ if (strcmp(q, ".gz") == 0 && q > p) {
+ end = q;
+ do {
+ q--;
+ } while (q >= p && *q != '.');
+ }
+ if (strncmp(q, ".fq", end - q) == 0 ||
+ strncmp(q, ".fa", end - q) == 0 ||
+ strncmp(q, ".fastq", end - q) == 0 ||
+ strncmp(q, ".fasta", end - q) == 0 ||
+ strncmp(q, ".bam", end - q) == 0)
+ return q - p;
+ else
+ return end - p;
+}
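
The new file_suffix_position() returns the index at which a recognized read-file suffix (.fq/.fa/.fastq/.fasta/.bam, optionally followed by .gz) begins, so callers can strip it when building output names. A simplified Python sketch of that idea; the example file names are hypothetical, and the fallback for unrecognized extensions is not an exact match of the C++ behaviour:

    import re

    def file_suffix_position(filename):
        # Index where the recognized suffix starts, or len(filename) if none is found.
        m = re.search(r"\.(fq|fa|fastq|fasta|bam)(\.gz)?$", filename)
        return m.start() if m else len(filename)

    # Hypothetical examples:
    # file_suffix_position("sample_01.fq.gz")  -> 9   (".fq.gz" stripped when naming discards)
    # file_suffix_position("sample_01.fastq")  -> 9
    # file_suffix_position("sample_01.txt")    -> 13  (unrecognized; name left as-is)
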
=====================================
src/file_io.h
=====================================
@@ -95,5 +95,6 @@ int open_files(vector<pair<string, string> > &,
map<string, map<string, long> > &);
int close_file_handles(map<BarcodePair, ofstream *> &);
int close_file_handles(map<BarcodePair, gzFile *> &);
+int file_suffix_position(string filename);
#endif // __FILE_IO_H__
=====================================
src/gstacks.cc
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2017-2021, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2017-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -57,6 +57,8 @@ bool bam_output = false;
bool detailed_output = false;
bool rm_unpaired_reads = false;
bool rm_pcr_duplicates = false;
+bool call_methyl_status = false;
+string reference_path;
modelt model_type = marukilow;
unique_ptr<const Model> model;
@@ -241,6 +243,13 @@ try {
o_hapgraphs_f << o_hapgraphs_header;
}
+ //
+ // If calling methylated bases, open the reference genome FASTA.
+ //
+ Faidx *reference_genome = NULL;
+ if (call_methyl_status)
+ reference_genome = new Faidx(reference_path);
+
//
// Store a list of contigs found in reference-aligned data.
//
@@ -280,7 +289,7 @@ try {
bool eof = false;
#pragma omp parallel num_threads(num_threads)
{ try {
- LocusProcessor loc_proc (bam_mpopi->samples().size(), bam_cloc_reader.get());
+ LocusProcessor loc_proc (bam_mpopi->samples().size(), bam_cloc_reader.get(), reference_genome);
Timers& t = loc_proc.timers();
CLocReadSet loc (*bam_mpopi); // For denovo.
@@ -1254,6 +1263,15 @@ LocusProcessor::process(CLocAlnSet& aln_loc)
}
timers_.genotyping.update();
+ //
+ // Call methylated bases, if requested.
+ //
+ vector<SiteMethylCounts> methylcalls;
+ if (call_methyl_status) {
+ methylcalls.reserve(aln_loc.ref().length());
+ call_methylated_status(aln_loc, calls, methylcalls);
+ }
+
//
// Call haplotypes; amend SNP/genotype calls.
//
@@ -1290,7 +1308,7 @@ LocusProcessor::process(CLocAlnSet& aln_loc)
//
// Create the outputs.
//
- write_one_locus(aln_loc, depths, calls, phase_data);
+ write_one_locus(aln_loc, depths, calls, phase_data, methylcalls);
if (detailed_output) {
loc_.details_ss << "END locus " << loc_.id << "\n";
@@ -2409,11 +2427,219 @@ bool PhaseSet::is_edge_consistent(size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j
return !crossed_edge;
}
+void LocusProcessor::call_methylated_status (
+ const CLocAlnSet& aln_loc,
+ const vector<SiteCall>& calls,
+ vector<SiteMethylCounts>& methylcalls
+) {
+ const PhyLoc loc = aln_loc.pos();
+ //
+ // Check if we have the reference sequence for this locus yet.
+ //
+ if (this->ref_chrs_.count(loc.chr()) == 0) {
+ #pragma omp critical
+ {
+ if (this->ref_chrs_.count(loc.chr()) == 0) {
+ Seq seq;
+ this->ref_genome_->read_seq(loc.chr(), seq);
+ this->ref_chrs_[loc.chr()] = string(seq.seq);
+ }
+ }
+ }
+
+ char *ref = new char[aln_loc.ref().length() + 1];
+ const char *r = this->ref_chrs_[loc.chr()].c_str() + loc.bp;
+ cerr << "Length: " << aln_loc.ref().length() << "\n";
+ //
+ // Pull a copy of the locus from the reference genome.
+ //
+ if (loc.strand == strand_type::strand_plus) {
+ strncpy(ref, r, aln_loc.ref().length());
+ } else {
+ // If the locus is on the negative strand
+ r = r - (aln_loc.ref().length() - 1);
+ strncpy(ref, r, aln_loc.ref().length());
+ rev_comp_inplace(ref);
+ }
+
+ DNASeq4 consensus (aln_loc.ref().length());
+ for (size_t i = 0; i < aln_loc.ref().length(); ++i) {
+ if (!calls[i].alleles().empty()) {
+ consensus.set(i, calls[i].most_frequent_allele());
+ }
+ // // } else {
+ // // pair<size_t,Nt2> nt = depths[i].tot.sorted()[0];
+ // // if (nt.first > 0)
+ // // consensus.set(i, nt.second);
+ // // }
+ }
+
+ //
+ // Adjust the reference genome sequence to match the assembled contig.
+ //
+ GappedAln aligner;
+ AlignRes aln_res;
+ aligner.init(aln_loc.ref().length(), aln_loc.ref().length());
+ aligner.align(string(ref), consensus.str());
+ string cigar_str = invert_cigar(aligner.result().cigar);
+ cerr << "CIGAR: " << cigar_str << "\n";
+ cerr << "REF: " << string(ref) << "\n";
+ cerr << "LOCUS: " << consensus.str() << "\n";
+
+ // cerr << "RAD locus is at " << loc.chr() << "; "
+ // << (loc.strand == strand_type::strand_plus ? loc.bp : loc.bp - aln_loc.ref().length() - 1) << ": "
+ // << string(ref, aln_loc.ref().length()) << "\n"
+ // << "Length: " << aln_loc.ref().length() << "\n";;
+
+ size_t tot_samples = aln_loc.mpopi().samples().size();
+ //
+ // Iterate over each SNP call and compare the bases with the imported reference genome sequence.
+ //
+ size_t end = calls.size() - 1;
+ for (size_t i = 0; i < calls.size(); i++) {
+
+ const vector<SampleCall> sample_calls = calls[i].sample_calls();
+
+ // Initialize the array of calls at this site.
+ methylcalls.push_back(SiteMethylCounts());
+ for (size_t j = 0; j < tot_samples; j++)
+ methylcalls[i].samples.push_back(methylt::irrelevant);
+
+ //
+ // No useful data at this site.
+ //
+ if (calls[i].alleles().empty())
+ continue;
+
+ if (ref[i] != 'G')
+ continue;
+
+ //
+ // Call unmethylated status of site along with nearby context:
+ // CG: 'C' followed directly by a 'G'; also referred to as CpG or mCG
+ // CHG: 'C' followed by an H: 'A', 'T', or 'C', followed by a 'G'
+ // CHH: 'C' followed by H ('A', 'T', or 'C'), followed by H ('A', 'T', or 'C')
+ //
+ // With EM-Seq pulling from the negative strand:
+ // We will see the 'C' as a 'G', converted by EM-seq to an 'A'
+ //
+
+ if (i < end - 2) {
+ // Are there two additional sites left in the locus, so we can call CHH and CHG?
+ vector<bool> site_1(tot_samples, false);
+ vector<bool> site_2(tot_samples, false);
+ vector<bool> site_3(tot_samples, false);
+
+ sites_equal_nucleotide(calls[i], 'A', site_1);
+ sites_equal_nucleotide(calls[i + 1], 'C', site_2);
+ sites_equal_nucleotide(calls[i + 2], 'C', site_3);
+
+ for (size_t j = 0; j < tot_samples; j++) {
+ if (site_1[j] == true && site_2[j] == false && site_3[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_chg;
+ else if (site_1[j] == true && site_2[j] == false && site_3[j] == false)
+ methylcalls[i].samples[j] = methylt::unmethyl_chh;
+ else
+ methylcalls[i].samples[j] = methylt::unmethyl_cpg;
+ }
+
+ } else if (i < end - 1) {
+ //
+ // Is there one additional site left in the locus, so we can call CpG?
+ // If so:
+            //     Test for unmethylated CpG: a C nucleotide followed directly by a G
+ //
+ vector<bool> site_1(tot_samples, false);
+ vector<bool> site_2(tot_samples, false);
+
+ sites_equal_nucleotide(calls[i], 'A', site_1);
+ sites_equal_nucleotide(calls[i + 1], 'C', site_2);
+
+ for (size_t j = 0; j < tot_samples; j++) {
+ if (site_1[j] == true && site_2[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_cpg;
+ else if (site_1[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_unk;
+ }
+ } else {
+ //
+            // This is the last site in the locus, or it is surrounded by uncalled nucleotides ('N'),
+            // so we cannot determine additional methylation context.
+ //
+ vector<bool> site_1(tot_samples, false);
+
+ sites_equal_nucleotide(calls[i], 'A', site_1);
+
+ for (size_t j = 0; j < tot_samples; j++)
+ if (site_1[j] == true)
+ methylcalls[i].samples[j] = methylt::unmethyl_unk;
+ }
+ }
+
+ // for (size_t i = 0; i < methylcalls.size(); i++) {
+ // cerr << "Column: " << i << "\n";
+ // cerr << " Ref[" << i << "]: " << ref[i] << "; called: " << calls[i].most_frequent_allele() << "\n";
+ // cerr << " Sample calls:\n";
+ // for (size_t j = 0; j < tot_samples; j++) {
+ // if (methylcalls[i].samples[j] == methylt::unmethyl_unk)
+    //             cerr << "    Sample[" << j << "]: unmethylated unknown\n";
+    //         else if (methylcalls[i].samples[j] == methylt::unmethyl_cpg)
+    //             cerr << "    Sample[" << j << "]: unmethylated CpG\n";
+    //         else if (methylcalls[i].samples[j] == methylt::unmethyl_chg)
+    //             cerr << "    Sample[" << j << "]: unmethylated CHG\n";
+    //         else if (methylcalls[i].samples[j] == methylt::unmethyl_chh)
+    //             cerr << "    Sample[" << j << "]: unmethylated CHH\n";
+ // }
+ // }
+
+ delete [] ref;
+
+ return;
+}
+
+inline
+void LocusProcessor::sites_equal_nucleotide(const SiteCall &call, char nuc, vector<bool> &site_vals)
+{
+ if (call.alleles().size() == 1) {
+ // Fixed site
+ if (call.most_frequent_allele() == nuc)
+ for (size_t i = 0; i < site_vals.size(); i++)
+ site_vals[i] = true;
+
+ } else {
+ const vector<SampleCall> &sample_calls = call.sample_calls();
+
+ // Variable site
+ map<Nt2,double>::const_iterator a_1 = call.alleles().begin();
+ map<Nt2,double>::const_iterator a_2 = a_1++;
+ if (a_1->first != nuc && a_2->first != nuc)
+ return;
+
+ for (size_t i = 0; i < sample_calls.size(); i++) {
+ switch(sample_calls[i].call()) {
+ case snp_type_hom:
+ if (sample_calls[i].nt0() == nuc)
+ site_vals[i] = true;
+ break;
+ case snp_type_het:
+ if (sample_calls[i].nt0() == nuc || sample_calls[i].nt1() == nuc)
+ site_vals[i] = true;
+ break;
+ case snp_type_unk:
+ case snp_type_discarded:
+ default:
+ break;
+ }
+ }
+ }
+}
+
void LocusProcessor::write_one_locus (
const CLocAlnSet& aln_loc,
const vector<SiteCounts>& depths,
const vector<SiteCall>& calls,
- const vector<map<size_t,PhasedHet>>& phase_data
+ const vector<map<size_t,PhasedHet>>& phase_data,
+ const vector<SiteMethylCounts>& methylcalls
) {
char loc_id[16];
sprintf(loc_id, "%d", loc_.id);
@@ -2431,8 +2657,10 @@ void LocusProcessor::write_one_locus (
stringstream vcf_records;
VcfRecord rec;
for (size_t i=0; i<ref.length(); ++i) {
- const SiteCounts& sitedepths = depths[i];
- const SiteCall& sitecall = calls[i];
+ const SiteCounts& sitedepths = depths[i];
+ const SiteCall& sitecall = calls[i];
+ const SiteMethylCounts& methylcall = methylcalls[i];
+
if (sitecall.alleles().empty())
// No useful data at this site.
continue;
@@ -2503,15 +2731,48 @@ void LocusProcessor::write_one_locus (
}
// Format.
rec.append_format("DP");
- // Genotypes.
- for (size_t sample=0; sample<mpopi.samples().size(); ++sample) {
+ if (call_methyl_status)
+ rec.append_format("MY");
+
+ for (size_t sample = 0; sample < mpopi.samples().size(); ++sample) {
+ stringstream genotype;
+
+ // Genotypes.
size_t dp = sitedepths.samples[sample].sum();
- if (dp == 0) {
- rec.append_sample(".");
- continue;
+ if (dp > 0) {
+ genotype << dp;
+ ++sample_sites_w_data[sample];
+ } else {
+ genotype << ".";
}
- ++sample_sites_w_data[sample];
- rec.append_sample(to_string(dp));
+
+ if (call_methyl_status) {
+ // Methylation calls
+ genotype << ":";
+
+ if (dp == 0)
+ genotype << ".";
+ else
+ switch(methylcall.samples[sample]) {
+ case methylt::unmethyl_cpg:
+ genotype << "z";
+ break;
+ case methylt::unmethyl_chg:
+ genotype << "x";
+ break;
+ case methylt::unmethyl_chh:
+ genotype << "h";
+ break;
+ case methylt::unmethyl_unk:
+ genotype << "u";
+ break;
+ case methylt::irrelevant:
+ default:
+ genotype << ".";
+ break;
+ }
+ }
+ rec.append_sample(genotype.str());
}
} else {
@@ -2548,6 +2809,8 @@ void LocusProcessor::write_one_locus (
rec.append_format("DP");
rec.append_format("AD");
rec.append_format("GL");
+ if (call_methyl_status)
+ rec.append_format("MY");
// Genotypes.
for (size_t sample : mpopi.sample_indexes_orig_order()) {
@@ -2616,6 +2879,31 @@ void LocusProcessor::write_one_locus (
genotype << ":";
join(sdepths.arr(), ',', genotype);
}
+
+ if (call_methyl_status) {
+ // Methylation calls
+ genotype << ":";
+
+ switch(methylcall.samples[sample]) {
+ case methylt::unmethyl_cpg:
+ genotype << "z";
+ break;
+ case methylt::unmethyl_chg:
+ genotype << "x";
+ break;
+ case methylt::unmethyl_chh:
+ genotype << "h";
+ break;
+ case methylt::unmethyl_unk:
+ genotype << "u";
+ break;
+ case methylt::irrelevant:
+ default:
+ genotype << ".";
+ break;
+ }
+ }
+
// Push it.
rec.append_sample(genotype.str());
}
@@ -2834,6 +3122,7 @@ const string help_string = string() +
" --write-alignments: save read alignments (heavy BAM files)\n"
"\n"
" (Reference-based mode)\n"
+    "  --methyl-status: assumes a methylation-aware molecular protocol was used; calls methylated bases.\n"
" --min-mapq: minimum PHRED-scaled mapping quality to consider a read (default: 10)\n"
" --max-clipped: maximum soft-clipping level, in fraction of read length (default: 0.20)\n"
" --max-insert-len: maximum allowed sequencing insert length (default: 1000)\n"
@@ -2896,6 +3185,8 @@ try {
{"max-insert-len", required_argument, NULL, 1016},
{"rm-unpaired-reads", no_argument, NULL, 1017},
{"rm-pcr-duplicates", no_argument, NULL, 1018},
+ {"methyl-status", no_argument, NULL, 1025},
+ {"reference", required_argument, NULL, 1026},
{"phasing-cooccurrences-thr-range", required_argument, NULL, 1019},
{"phasing-dont-prune-hets", no_argument, NULL, 1020},
//debug options
@@ -2995,11 +3286,11 @@ try {
bad_args();
}
break;
- case 1012: //ignore-pe-reads
+ case 1012: // ignore-pe-reads
ignore_pe_reads = true;
refbased_cfg.ign_pe_reads = true;
break;
- case 1001://kmer-length
+ case 1001: // kmer-length
km_length = atoi(optarg);
if (km_length < 2 || km_length > 31) {
cerr << "Error: Illegal -t option value '" << optarg << "' (valid range is 2-31).\n";
@@ -3011,53 +3302,53 @@ try {
tmp_long = std::stol(optarg);
if (tmp_long < 1)
throw exception();
- } catch(std::exception) {
+ } catch(std::exception &e) {
cerr << "Error: Illegal max-debruijn-reads option value '" << optarg << "'.\n";
bad_args();
}
max_debruijn_reads = tmp_long;
break;
- case 1002://min-kmer-cov
+ case 1002: // min-kmer-cov
min_km_count = atoi(optarg);
break;
- case 1021://write-alignments
+ case 1021: // write-alignments
bam_output = true;
break;
- case 1013://details
+ case 1013: // details
detailed_output = true;
break;
- case 1014://min-mapq
+ case 1014: // min-mapq
refbased_cfg.min_mapq = stoi(optarg);
if (refbased_cfg.min_mapq > 255) {
cerr << "Error: Illegal --min-mapq value '" << optarg << "'.\n";
bad_args();
}
break;
- case 1015://max-clipped
+ case 1015: // max-clipped
refbased_cfg.max_clipped = atof(optarg);
if (refbased_cfg.max_clipped < 0.0 || refbased_cfg.max_clipped > 1.0) {
cerr << "Error: Illegal --max-clipped value '" << optarg << "'.\n";
bad_args();
}
break;
- case 1016://max-insert
+ case 1016: // max-insert
refbased_cfg.max_insert_refsize = stoi(optarg);
break;
- case 1017://rm-unpaired-reads
+ case 1017: // rm-unpaired-reads
rm_unpaired_reads = true;
break;
- case 1018://rm-pcr-duplicates
+ case 1018: // rm-pcr-duplicates
rm_pcr_duplicates = true;
rm_unpaired_reads = true;
// no_pcr_duplicates_measures();
break;
- // case 1022://pcr-duplicates-measures
+ // case 1022: // pcr-duplicates-measures
// pcr_duplicates_measures();
// break;
- // case 1023://no-pcr-duplicates-measures
+ // case 1023: // no-pcr-duplicates-measures
// no_pcr_duplicates_measures();
// break;
- case 1019: //phasing-cooccurrences-thr-range
+ case 1019: // phasing-cooccurrences-thr-range
std::regex_search(optarg, tmp_cmatch, std::regex("^[0-9]+,[0-9]+$"));
if (!tmp_cmatch.empty()) {
phasing_cooccurrences_thr_range.first = atoi(optarg);
@@ -3069,10 +3360,15 @@ try {
bad_args();
}
break;
- case 1020: //phasing-dont-prune-hets
+ case 1020: // phasing-dont-prune-hets
phasing_dont_prune_hets = true;
break;
-
+ case 1025: // methyl-status
+ call_methyl_status = true;
+ break;
+ case 1026: // reference
+ reference_path = optarg;
+ break;
//
// Debug options
//
@@ -3259,6 +3555,11 @@ try {
if (in_bams.empty())
DOES_NOT_HAPPEN;
+ if (call_methyl_status && reference_path.length() == 0) {
+ cerr << "You must specify a reference genome in FASTA format when calling methylated bases.\n";
+ bad_args();
+ }
+
//
// Write the report.
//
@@ -3302,6 +3603,8 @@ try {
os << " Removing PCR duplicates.\n";
// if (pcr_dupl_measures)
// os << " PCR duplicates mitigation measures enabled.\n";
+ if (call_methyl_status)
+        os << "  Calling methylated bases using reference: '" << reference_path << "'\n";
return os.str();
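
The CpG/CHG/CHH definitions in the comments of call_methylated_status(), and the z/x/h/u/. codes written to the MY field, boil down to classifying the two bases that follow a cytosine. The sketch below illustrates those definitions on the forward strand only, with invented sequences; the gstacks code itself works on the G-to-A converted representation seen in EM-seq data, so this is an illustration of the terminology, not of the implementation:

    def cytosine_context(ref, i):
        # Context of a 'C' at position i: CpG, CHG, CHH, or unknown near the locus edge.
        if i >= len(ref) or ref[i] != "C":
            return None
        h = set("ACT")
        if i + 1 < len(ref) and ref[i + 1] == "G":
            return "CpG"
        if i + 2 < len(ref) and ref[i + 1] in h and ref[i + 2] == "G":
            return "CHG"
        if i + 2 < len(ref) and ref[i + 1] in h and ref[i + 2] in h:
            return "CHH"
        return "unknown"

    # Codes used for unmethylated calls in the MY FORMAT field above.
    MY_CODE = {"CpG": "z", "CHG": "x", "CHH": "h", "unknown": "u", None: "."}

    for pos, seq in [(1, "ACGTT"), (1, "ACTGA"), (1, "ACTTA"), (3, "AAAC")]:
        ctx = cytosine_context(seq, pos)
        print(seq, pos, ctx, MY_CODE[ctx])
    # ACGTT 1 CpG z / ACTGA 1 CHG x / ACTTA 1 CHH h / AAAC 3 unknown u
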
=====================================
src/gstacks.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2017, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2017-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -26,6 +26,7 @@
#include "SuffixTree.h"
#include "GappedAln.h"
#include "BamI.h"
+#include "FaidxI.h"
#include "locus_readers.h"
string parse_command_line(int argc, char* argv[]);
@@ -234,8 +235,8 @@ struct Timers {
//
class LocusProcessor {
public:
- LocusProcessor(size_t n_samples, const BamCLocReader* reader)
- : gt_stats_(n_samples), hap_stats_(n_samples), ctg_stats_(), loc_(), bam_cloc_reader_(reader) {}
+ LocusProcessor(size_t n_samples, const BamCLocReader* reader, Faidx *ref_genome)
+ : gt_stats_(n_samples), hap_stats_(n_samples), ctg_stats_(), loc_(), bam_cloc_reader_(reader), ref_genome_(ref_genome) {}
// Process a locus.
void process(CLocReadSet& loc);
@@ -260,6 +261,9 @@ private:
mutable LocData loc_;
const BamCLocReader* bam_cloc_reader_;
+ Faidx *ref_genome_;
+ map<string, string> ref_chrs_;
+
DNASeq4 assemble_locus_contig(
const vector<SRead>& fw_reads,
const vector<SRead>& pe_reads);
@@ -329,12 +333,26 @@ private:
const size_t min_n_cooccurrences
) const;
+ //
+ // Call methylated status of each base.
+ //
+ void call_methylated_status(
+ const CLocAlnSet& aln_loc,
+ const vector<SiteCall>& calls,
+ vector<SiteMethylCounts>& methylcalls);
+
+ void sites_equal_nucleotide(
+ const SiteCall &call,
+ char nuc,
+ vector<bool> &site_vals);
+
// Create the fasta/vcf text outputs.
void write_one_locus (
const CLocAlnSet& aln_loc,
const vector<SiteCounts>& depths,
const vector<SiteCall>& calls,
- const vector<map<size_t,PhasedHet>>& phase_data // {col : phasedhet} maps, for all samples
+ const vector<map<size_t,PhasedHet>>& phase_data, // {col : phasedhet} maps, for all samples
+ const vector<SiteMethylCounts>& methylcalls
);
// (debug) Write a sample's haplotype graph.
=====================================
src/populations.cc
=====================================
@@ -2769,10 +2769,10 @@ SumStatsSummary::accumulate(const vector<LocBin *> &loci)
//
// Compile private alleles
//
- if (t->nucs[pos].priv_allele >= 0)
+ if (t->nucs[pos].priv_allele >= 0) {
_private_cnt[t->nucs[pos].priv_allele]++;
-
- else if (t->nucs[pos].priv_allele == two_priv_alleles) {
+
+ } else if (t->nucs[pos].priv_allele == two_priv_alleles) {
// If there are two private alleles at this position, find the two populations they occurred in.
for (uint pop = 0; pop < this->_pop_cnt; pop++) {
s = loci[i]->s->per_pop(pop);
=====================================
src/process_radtags.cc
=====================================
@@ -52,6 +52,7 @@ barcodet barcode_type = null_null;
bool retain_header = false;
bool filter_adapter = false;
bool bestrad = false;
+bool methylrad = false;
bool paired = false;
bool clean = false;
bool quality = false;
@@ -109,6 +110,28 @@ int main (int argc, char* argv[]) {
parse_command_line(argc, argv);
+ //
+    // If the user specifies a methylated restriction enzyme, make sure we have its sequence.
+ //
+ if (check_radtag && methylrad) {
+ initialize_renz_methylconv(renz);
+
+ if (renz.count(renz_1) == 0) {
+ cerr << "Unrecognized methylated restriction enzyme specified: '" << renz_1.c_str() << "'.\n";
+ help();
+ }
+
+ if (renz_2.length() > 0 && renz.count(renz_2) == 0) {
+ cerr << "Unrecognized methylated restriction enzyme specified: '" << renz_2.c_str() << "'.\n";
+ help();
+ }
+
+ cerr << "Converting restriction enzyme sequence for '" << renz_1 << "'";
+ if (renz_2.length() > 0)
+ cerr << " and '" << renz_2 << "'";
+ cerr << " to methylated version for cut site remnant checking.\n";
+ }
+
//
// If input files are gzipped, output gziped files, unless the user chooses an output type.
//
@@ -415,7 +438,11 @@ process_paired_reads(string prefix_1,
// Open a file for recording discarded reads
//
if (discards) {
- path_1 = out_path + prefix_1 + ".discards";
+ string dprefix_1 = prefix_1.substr(0, file_suffix_position(prefix_1));
+ string dprefix_2 = prefix_2.substr(0, file_suffix_position(prefix_2));
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
+
+ path_1 = out_path + dprefix_1 + ".discards" + dsuffix;
discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
if (discard_fh_1->fail()) {
@@ -423,7 +450,7 @@ process_paired_reads(string prefix_1,
exit(1);
}
- path_2 = out_path + prefix_2 + ".discards";
+ path_2 = out_path + dprefix_2 + ".discards" + dsuffix;
discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
if (discard_fh_2->fail()) {
@@ -638,7 +665,7 @@ process_paired_reads_parallel(string prefix_1,
map<BarcodePair, map<string, long> > &barcode_log) {
Input *fh_1=NULL, *fh_2=NULL;
RawRead *r_1=NULL, *r_2=NULL;
- ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
+ vector<ofstream *> discard_fhs_1(num_output_threads), discard_fhs_2(num_output_threads);
int return_val = 1;
@@ -668,20 +695,29 @@ process_paired_reads_parallel(string prefix_1,
// Open a file for recording discarded reads
//
if (discards) {
- path_1 = out_path + prefix_1 + ".discards";
- discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
-
- if (discard_fh_1->fail()) {
- cerr << "Error opening single-end discard output file '" << path_1 << "'\n";
- exit(1);
- }
+ string dprefix_1 = prefix_1.substr(0, file_suffix_position(prefix_1));
+ string dprefix_2 = prefix_2.substr(0, file_suffix_position(prefix_2));
+
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
+
+ for (uint i = 0; i < discard_fhs_1.size(); i++) {
+ string thread_str = discard_fhs_1.size() > 1 ? "." + to_string(i+1) : "";
+
+ path_1 = out_path + dprefix_1 + thread_str + ".discards" + dsuffix;
+ discard_fhs_1[i] = new ofstream(path_1.c_str(), ifstream::out);
+
+ if (discard_fhs_1[i]->fail()) {
+ cerr << "Error opening single-end discard output file '" << path_1 << "'\n";
+ exit(1);
+ }
- path_2 = out_path + prefix_2 + ".discards";
- discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
+ path_2 = out_path + dprefix_2 + thread_str + ".discards" + dsuffix;
+ discard_fhs_2[i] = new ofstream(path_2.c_str(), ifstream::out);
- if (discard_fh_2->fail()) {
- cerr << "Error opening paired-end discard output file '" << path_2 << "'\n";
- exit(1);
+ if (discard_fhs_2[i]->fail()) {
+ cerr << "Error opening paired-end discard output file '" << path_2 << "'\n";
+ exit(1);
+ }
}
}
@@ -890,12 +926,12 @@ process_paired_reads_parallel(string prefix_1,
if (discards && !r_1->retain)
result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_1, s_1) :
- write_fasta(discard_fh_1, s_1);
+ write_fastq(discard_fhs_1[thread_no - 1], s_1) :
+ write_fasta(discard_fhs_1[thread_no - 1], s_1);
if (discards && !r_2->retain)
result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
- write_fastq(discard_fh_2, s_2) :
- write_fasta(discard_fh_2, s_2);
+ write_fastq(discard_fhs_2[thread_no - 1], s_2) :
+ write_fasta(discard_fhs_2[thread_no - 1], s_2);
if (!result_1 || !result_2) {
cerr << "Error writing to discard file for '" << bc->str() << "'\n";
@@ -1060,8 +1096,10 @@ process_paired_reads_parallel(string prefix_1,
consolidate_counters(counter, local_counters);
if (discards) {
- delete discard_fh_1;
- delete discard_fh_2;
+ for (uint i = 0; i < discard_fhs_1.size(); i++) {
+ delete discard_fhs_1[i];
+ delete discard_fhs_2[i];
+ }
}
delete fh_1;
@@ -1101,7 +1139,10 @@ process_reads(string prefix,
// Open a file for recording discarded reads
//
if (discards) {
- path = out_path + prefix + ".discards";
+ string dprefix = prefix.substr(0, file_suffix_position(prefix));
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
+
+ path = out_path + dprefix + ".discards" + dsuffix;
discard_fh = new ofstream(path.c_str(), ifstream::out);
if (discard_fh->fail()) {
@@ -1242,8 +1283,8 @@ process_reads_parallel(string prefix,
map<string, long> &counter,
map<BarcodePair, map<string, long>> &barcode_log)
{
- Input *fh = NULL;
- ofstream *discard_fh = NULL;
+ Input *fh = NULL;
+ vector<ofstream *> discard_fhs(num_output_threads);
int return_val = 1;
@@ -1262,12 +1303,18 @@ process_reads_parallel(string prefix,
// Open a file for recording discarded reads
//
if (discards) {
- path = out_path + prefix + ".discards";
- discard_fh = new ofstream(path.c_str(), ifstream::out);
+ string dprefix = prefix.substr(0, file_suffix_position(prefix));
+ string dsuffix = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ? ".fq" : ".fa";
- if (discard_fh->fail()) {
- cerr << "Error opening discard output file '" << path << "'\n";
- exit(1);
+ for (uint i = 0; i < discard_fhs.size(); i++) {
+            string thread_str = discard_fhs.size() > 1 ? "." + to_string(i+1) : "";
+            path = out_path + dprefix + thread_str + ".discards" + dsuffix;
+ discard_fhs[i] = new ofstream(path.c_str(), ifstream::out);
+
+ if (discard_fhs[i]->fail()) {
+ cerr << "Error opening discard output file '" << path << "'\n";
+ exit(1);
+ }
}
}
@@ -1439,10 +1486,9 @@ process_reads_parallel(string prefix,
}
if (discards && !r->retain) {
- #pragma omp critical
result = out_file_type == FileT::fastq || out_file_type == FileT::gzfastq ?
- write_fastq(discard_fh, s) :
- write_fasta(discard_fh, s);
+ write_fastq(discard_fhs[thread_no - 1], s) :
+ write_fasta(discard_fhs[thread_no - 1], s);
}
if (!result) {
@@ -1572,8 +1618,12 @@ process_reads_parallel(string prefix,
consolidate_barcode_logs(barcode_log, local_barcode_log);
consolidate_counters(counter, local_counters);
- if (discards) delete discard_fh;
-
+ if (discards) {
+ for (uint i = 0; i < discard_fhs.size(); i++) {
+ delete discard_fhs[i];
+ }
+ }
+
//
// Close the file and delete the Input object.
//
@@ -2092,9 +2142,10 @@ int parse_command_line(int argc, char* argv[]) {
{"interleaved", no_argument, NULL, 'I'},
{"merge", no_argument, NULL, 'm'},
{"disable-rad-check", no_argument, NULL, 'R'}, {"disable_rad_check", no_argument, NULL, 'R'},
- {"filter-illumina", no_argument, NULL, 'F'}, {"filter_illumina", no_argument, NULL, 'F'},
- {"retain-header", no_argument, NULL, 'H'}, {"retain_header", no_argument, NULL, 'H'},
+ {"filter-illumina", no_argument, NULL, 'F'},
+ {"retain-header", no_argument, NULL, 'H'},
{"bestrad", no_argument, NULL, 1000},
+ {"methylrad", no_argument, NULL, 1002},
{"null-index", no_argument, NULL, 'U'}, {"null_index", no_argument, NULL, 'U'},
{"index-null", no_argument, NULL, 'u'}, {"index_null", no_argument, NULL, 'u'},
{"inline-null", no_argument, NULL, 'V'}, {"inline_null", no_argument, NULL, 'V'},
@@ -2196,6 +2247,9 @@ int parse_command_line(int argc, char* argv[]) {
case 1001:
num_threads = is_integer(optarg);
break;
+ case 1002:
+ methylrad = true;
+ break;
case 'B':
barcode_dist_1 = is_integer(optarg);
break;
@@ -2471,7 +2525,8 @@ void help() {
cerr << "\n"
<< " Protocol-specific options:\n"
- << " --bestrad: library was generated using BestRAD, check for restriction enzyme on either read and potentially tranpose reads.\n\n"
+        << "  --bestrad: library was generated using BestRAD, check for restriction enzyme on either read and potentially transpose reads.\n"
+ << " --methylrad: library was generated using enzymatic methyl-seq (EM-seq) or bisulphite sequencing.\n\n"
<< " Adapter options:\n"
<< " --adapter-1 <sequence>: provide adaptor sequence that may occur on the single-end read for filtering.\n"
<< " --adapter-2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
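
With the parallel discard fix above, each output thread now gets its own discard file, named from the input prefix with its read-file suffix stripped, a per-thread index when more than one thread writes, and an explicit .fq/.fa extension. A small Python sketch of that naming scheme, with invented inputs:

    def discard_paths(out_path, stripped_prefix, fastq_output, n_output_threads):
        # Mirrors the path construction in process_paired_reads_parallel() above.
        suffix = ".fq" if fastq_output else ".fa"
        paths = []
        for i in range(n_output_threads):
            thread_str = "." + str(i + 1) if n_output_threads > 1 else ""
            paths.append(out_path + stripped_prefix + thread_str + ".discards" + suffix)
        return paths

    # Hypothetical example:
    # discard_paths("./out/", "sample_01", True, 2)
    #   -> ['./out/sample_01.1.discards.fq', './out/sample_01.2.discards.fq']
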
=====================================
src/renz.cc
=====================================
@@ -142,9 +142,18 @@ const char *xbaI[] = {"CTAGA", // T/CTAGA, XbaI
const char *xhoI[] = {"TCGAG", // C/TCGAG, XhoI
"CTCGA"};
-void
-initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len) {
+//
+// Methylation-converted restriction enzyme sequences: Gs -> As
+//
+const char *pstI_methyl[] = {"TACAA", // CTGCA/G, PstI
+ "TTGTA"};
+const char *sbfI_methyl[] = {"TACAAA", // CCTGCA/GG, SbfI
+ "TTTGTA"};
+
+void
+initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len)
+{
renz["sbfI"] = sbfI; // CCTGCA/GG, SbfI
renz["pstI"] = pstI; // CTGCA/G, PstI
renz["pstIshi"] = pstIshi; // CTGCA/G, PstI-SHI
@@ -333,7 +342,17 @@ initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, ma
}
void
-initialize_renz_olap(map<string, int> &renz_olap) {
+initialize_renz_methylconv(map<string, const char **> &renz)
+{
+ renz.clear();
+
+ renz["sbfI"] = sbfI_methyl; // CCTGCA/GG, SbfI
+ renz["pstI"] = pstI_methyl; // CTGCA/G, PstI
+}
+
+void
+initialize_renz_olap(map<string, int> &renz_olap)
+{
renz_olap["sbfI"] = 4;
}
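
The *_methyl tables follow the "Gs -> As" rule in the comment above and, as in the xhoI entry earlier in this file, the second entry is the reverse complement of the first. A short Python check of that relationship; the unconverted pstI/sbfI remnants are inferred from the cut-site comments (CTGCA/G, CCTGCA/GG) and are not shown in this hunk:

    def methyl_convert(remnant):
        # EM-seq/bisulphite conversion as modelled here: every G becomes an A.
        return remnant.replace("G", "A")

    def rev_comp(seq):
        comp = {"A": "T", "C": "G", "G": "C", "T": "A"}
        return "".join(comp[nt] for nt in reversed(seq))

    for name, remnant in [("pstI", "TGCAG"), ("sbfI", "TGCAGG")]:
        converted = methyl_convert(remnant)
        print(name, converted, rev_comp(converted))
    # pstI TACAA TTGTA    (matches pstI_methyl)
    # sbfI TACAAA TTTGTA  (matches sbfI_methyl)
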
=====================================
src/renz.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2011-2016, Julian Catchen <jcatchen at illinois.edu>
+// Copyright 2011-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -28,5 +28,6 @@
void initialize_renz_olap(map<string, int> &renz_olap);
void initialize_renz(map<string, const char **> &renz, map<string, int> &renz_cnt, map<string, int> &renz_len);
+void initialize_renz_methylconv(map<string, const char **> &renz);
#endif // __RENZ_H__
=====================================
src/stacks.h
=====================================
@@ -1,6 +1,6 @@
// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
-// Copyright 2010, Julian Catchen <jcatchen at uoregon.edu>
+// Copyright 2010-2023, Julian Catchen <jcatchen at illinois.edu>
//
// This file is part of Stacks.
//
@@ -43,10 +43,11 @@
typedef unsigned int uint;
typedef string allele_type;
-enum snp_type {snp_type_het, snp_type_hom, snp_type_unk, snp_type_discarded};
-enum read_type {primary, secondary};
-enum searcht {sequence, genomic_loc};
-enum corr_type {p_value, bonferroni_win, bonferroni_gen, no_correction};
+enum snp_type {snp_type_het, snp_type_hom, snp_type_unk, snp_type_discarded};
+enum read_type {primary, secondary};
+enum searcht {sequence, genomic_loc};
+enum corr_type {p_value, bonferroni_win, bonferroni_gen, no_correction};
+enum methylt {unmethyl_cpg, unmethyl_chg, unmethyl_chh, unmethyl_unk, irrelevant};
class SNP {
public:
@@ -275,9 +276,14 @@ struct Read {
};
struct SiteCounts {
- Counts<Nt2> tot; // The sum over all samples.
- vector<Counts<Nt2>> samples; // With size() == mpopi->samples().size()/
- const MetaPopInfo* mpopi;
+ Counts<Nt2> tot; // The sum over all samples.
+ vector<Counts<Nt2>> samples; // With size() == mpopi->samples().size()
+ const MetaPopInfo *mpopi;
+};
+
+struct SiteMethylCounts {
+ vector<methylt> samples; // With size() == mpopi->samples().size()
+ const MetaPopInfo *mpopi;
};
//
View it on GitLab: https://salsa.debian.org/med-team/stacks/-/commit/60c4eb63eeee0774189faa71b77c8b2a3574c1b0