[med-svn] [Git][med-team/ivar][upstream] New upstream version 1.3.1+dfsg
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Fri Aug 27 21:32:25 BST 2021
Étienne Mollier pushed to branch upstream at Debian Med / ivar
Commits:
b02028a7 by Étienne Mollier at 2021-08-27T20:18:36+02:00
New upstream version 1.3.1+dfsg
- - - - -
13 changed files:
- configure.ac
- src/interval_tree.cpp
- src/interval_tree.h
- src/ivar.cpp
- src/primer_bed.cpp
- src/primer_bed.h
- src/trim_primer_quality.cpp
- src/trim_primer_quality.h
- tests/test_amplicon_search.cpp
- tests/test_interval_tree.cpp
- tests/test_isize_trim.cpp
- tests/test_primer_trim_edge_cases.cpp
- tests/test_trim.cpp
Changes:
=====================================
configure.ac
=====================================
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.63])
-AC_INIT([ivar], [1.3], [gkarthik at scripps.edu])
+AC_INIT([ivar], [1.3.1], [gkarthik at scripps.edu])
AM_INIT_AUTOMAKE([foreign subdir-objects])
AC_CONFIG_HEADERS([config.h])
=====================================
src/interval_tree.cpp
=====================================
@@ -38,13 +38,13 @@ void IntervalTree::insert(ITNode *root, Interval data){
}
}
// update max value of ancestor node
- if(root->max < data.low)
- root->max = data.low;
+ if(root->max < data.high)
+ root->max = data.high;
}
-// A utility function to check if given two intervals overlap
-bool doOverlap(Interval i1, Interval i2){
+// A utility function to check if the 1st interval envelops the second
+bool doEnvelop(Interval i1, Interval i2){
if(i1.low <= i2.low && i1.high >= i2.high)
return true;
return false;
@@ -53,23 +53,23 @@ bool doOverlap(Interval i1, Interval i2){
// The main function that searches an interval i in a given
// Interval Tree.
-bool IntervalTree::overlapSearch(ITNode *root, Interval i){
+bool IntervalTree::envelopSearch(ITNode *root, Interval i){
// Base Case, tree is empty
//std::cout << root->data->low << ":" << root->data->high << std::endl;
if (root == NULL) return false;
// If given interval overlaps with root
- if (doOverlap(*(root->data), i))
+ if (doEnvelop(*(root->data), i))
return true;
// If left child of root is present and max of left child is
// greater than or equal to given interval, then i may
- // overlap with an interval in left subtree
- if (root->left != NULL && root->left->max >= i.low)
- return overlapSearch(root->left, i);
+ // be enveloped by an amplicon in left subtree
+ if (root->left != NULL && root->left->max >= i.high)
+ return envelopSearch(root->left, i);
- // Else interval can only overlap with right subtree
- return overlapSearch(root->right, i);
+ // Else interval can only be enveloped by amplicon in right subtree
+ return envelopSearch(root->right, i);
}
// A helper function for inorder traversal of the tree
=====================================
src/interval_tree.h
=====================================
@@ -25,7 +25,7 @@ class ITNode{
void setRight(ITNode *node);
*/
public:
- ITNode(Interval value): data(new Interval(value)), left(nullptr), right(nullptr), max(value.low) {} // constructor
+ ITNode(Interval value): data(new Interval(value)), left(nullptr), right(nullptr), max(value.high) {} // constructor
Interval *data; // pointer to node's interval data object
ITNode *left, *right; // pointer to node's left & right child node objects
int max;
@@ -38,13 +38,13 @@ class IntervalTree{
private:
ITNode *_root;
void insert(ITNode *root, Interval data);
- bool overlapSearch(ITNode *root, Interval data);
+ bool envelopSearch(ITNode *root, Interval data);
void inOrder(ITNode * root);
public:
IntervalTree(); // constructor
void insert(Interval data){ insert(_root, data);}
- bool overlapSearch(Interval data){ return overlapSearch(_root, data);}
+ bool envelopSearch(Interval data){ return envelopSearch(_root, data);}
void inOrder() {inOrder(_root);}
};
=====================================
src/ivar.cpp
=====================================
@@ -16,7 +16,7 @@
#include "suffix_tree.h"
#include "get_common_variants.h"
-const std::string VERSION = "1.3";
+const std::string VERSION = "1.3.1";
struct args_t {
std::string bam; // -i
@@ -37,6 +37,7 @@ struct args_t {
char gap; // -n
bool keep_min_coverage; // -k
std::string primer_pair_file; // -f
+ int32_t primer_offset; // -x
std::string file_list; // -f
bool write_no_primers_flag; // -e
std::string gff; // -g
@@ -65,8 +66,9 @@ void print_trim_usage(){
"Input Options Description\n"
" -i (Required) Sorted bam file, with aligned reads, to trim primers and quality\n"
" -b BED file with primer sequences and positions. If no BED file is specified, only quality trimming will be done.\n"
- " -f Primer pair information file containing left and right primer names for the same amplicon separated by a tab\n"
- " If provided, reads will be filtered based on their overlap with amplicons prior to trimming\n"
+ " -f [EXPERIMENTAL] Primer pair information file containing left and right primer names for the same amplicon separated by a tab\n"
+ " If provided, reads that do not fall within atleat one amplicon will be ignored prior to primer trimming.\n"
+ " -x Primer position offset (Default: 0). Reads that occur at the specified offset positions relative to primer positions will also be trimmed.\n"
" -m Minimum length of read to retain after trimming (Default: 30)\n"
" -q Minimum quality threshold for sliding window to pass (Default: 20)\n"
" -s Width of sliding window (Default: 4)\n"
@@ -165,7 +167,7 @@ void print_version_info(){
"\nPlease raise issues and bug reports at https://github.com/andersen-lab/ivar/\n\n";
}
-static const char *trim_opt_str = "i:b:f:p:m:q:s:ekh?";
+static const char *trim_opt_str = "i:b:f:x:p:m:q:s:ekh?";
static const char *variants_opt_str = "p:t:q:m:r:g:h?";
static const char *consensus_opt_str = "i:p:q:t:m:n:kh?";
static const char *removereads_opt_str = "i:p:t:b:h?";
@@ -221,6 +223,7 @@ int main(int argc, char* argv[]){
g_args.keep_for_reanalysis = false;
g_args.bed = "";
g_args.primer_pair_file = "";
+ g_args.primer_offset = 0;
opt = getopt( argc, argv, trim_opt_str);
while( opt != -1 ) {
switch( opt ) {
@@ -232,6 +235,9 @@ int main(int argc, char* argv[]){
break;
case 'f':
g_args.primer_pair_file = optarg;
+ break;
+ case 'x':
+ g_args.primer_offset = std::stoi(optarg);
break;
case 'p':
g_args.prefix = optarg;
@@ -264,7 +270,7 @@ int main(int argc, char* argv[]){
return -1;
}
g_args.prefix = get_filename_without_extension(g_args.prefix,".bam");
- res = trim_bam_qual_primer(g_args.bam, g_args.bed, g_args.prefix, g_args.region, g_args.min_qual, g_args.sliding_window, cl_cmd.str(), g_args.write_no_primers_flag, g_args.keep_for_reanalysis, g_args.min_length, g_args.primer_pair_file);
+ res = trim_bam_qual_primer(g_args.bam, g_args.bed, g_args.prefix, g_args.region, g_args.min_qual, g_args.sliding_window, cl_cmd.str(), g_args.write_no_primers_flag, g_args.keep_for_reanalysis, g_args.min_length, g_args.primer_pair_file, g_args.primer_offset);
}
// ivar variants
else if (cmd.compare("variants") == 0){
=====================================
src/primer_bed.cpp
=====================================
@@ -85,6 +85,75 @@ void print_bed_format(){
std::cout << "It requires the following columns delimited by a tab: chrom, chromStart, chromEnd, name, score, strand" << std::endl;
}
+std::vector<primer> populate_from_file(std::string path, int32_t offset = 0){
+ std::ifstream data(path.c_str());
+ std::string line;
+ std::vector<primer> primers;
+ int16_t indice = 0;
+ while(std::getline(data,line)){ // Remove extra lineStream
+ std::stringstream lineStream(line);
+ std::string cell;
+ int ctr = 0;
+ primer p;
+ p.set_strand(0); // Set strand to NULL
+ while(std::getline(lineStream,cell,'\t')){
+ switch(ctr){
+ case 0:
+ p.set_region(cell);
+ break;
+ case 1:
+ if(std::all_of(cell.begin(), cell.end(), ::isdigit)) {
+ p.set_start(std::stoul(cell) - offset);
+ } else {
+ print_bed_format();
+ primers.clear();
+ return primers;
+ }
+ break;
+ case 2:
+ if(std::all_of(cell.begin(), cell.end(), ::isdigit)) {
+ p.set_end(std::stoul(cell) - 1 + offset); // Bed format - End is not 0 based
+ } else {
+ print_bed_format();
+ primers.clear();
+ return primers;
+ }
+ break;
+ case 3:
+ p.set_name(cell);
+ break;
+ case 4:
+ if(std::all_of(cell.begin(), cell.end(), ::isdigit)) {
+ p.set_score(stoi(cell));
+ } else {
+ print_bed_format(); // score is missing, send warning but continue populating
+ std::cout << "\nWARNING: The BED file provided did not have the expected score column, but iVar will continue trimming\n" << std::endl;
+ p.set_score(-1);
+ }
+ break;
+ case 5:
+ if(cell[0] == '+' || cell[0] == '-')
+ p.set_strand(cell[0]);
+ else {
+ print_bed_format();
+ primers.clear();
+ return primers;
+ }
+ }
+ ctr++;
+ }
+ if(indice == 0 && ctr < 6)
+ std::cout << "Strand not found in primer BED file so strand will not be considered for trimming" << std::endl;
+ p.set_indice(indice);
+ p.set_pair_indice(-1);
+ p.set_read_count(0);
+ primers.push_back(p);
+ indice++;
+ }
+ std::cout << "Found " << primers.size() << " primers in BED file" << std::endl;
+ return primers;
+}
+
std::vector<primer> populate_from_file(std::string path){
std::ifstream data(path.c_str());
std::string line;
@@ -112,7 +181,7 @@ std::vector<primer> populate_from_file(std::string path){
break;
case 2:
if(std::all_of(cell.begin(), cell.end(), ::isdigit)) {
- p.set_end(std::stoul(cell)-1); // Bed format - End is not 0 based
+ p.set_end(std::stoul(cell) - 1); // Bed format - End is not 0 based
} else {
print_bed_format();
primers.clear();
=====================================
src/primer_bed.h
=====================================
@@ -46,6 +46,7 @@ class primer {
};
+std::vector<primer> populate_from_file(std::string path, int32_t offset);
std::vector<primer> populate_from_file(std::string path);
std::vector<primer> get_primers(std::vector<primer> p, unsigned int pos);
int get_primer_indice(std::vector<primer> p, std::string name);
=====================================
src/trim_primer_quality.cpp
=====================================
@@ -331,7 +331,7 @@ int get_bigger_primer(std::vector<primer> primers){
return max_primer_len;
}
-// check if read overlaps with any of the amplicons
+// check if read is enveloped by any of the amplicons
bool amplicon_filter(IntervalTree amplicons, bam1_t* r){
Interval fragment_coords = Interval(0, 1);
if(r->core.isize > 0){
@@ -342,16 +342,16 @@ bool amplicon_filter(IntervalTree amplicons, bam1_t* r){
fragment_coords.high = bam_endpos(r);
}
// debugging
- bool amplicon_flag = amplicons.overlapSearch(fragment_coords);
+ bool amplicon_flag = amplicons.envelopSearch(fragment_coords);
return amplicon_flag;
}
-int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::string region_, uint8_t min_qual, uint8_t sliding_window, std::string cmd, bool write_no_primer_reads, bool keep_for_reanalysis, int min_length = 30, std::string pair_info = "") {
+int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::string region_, uint8_t min_qual, uint8_t sliding_window, std::string cmd, bool write_no_primer_reads, bool keep_for_reanalysis, int min_length = 30, std::string pair_info = "", int32_t primer_offset = 0) {
int retval = 0;
std::vector<primer> primers;
int max_primer_len = 0;
if(!bed.empty()){
- primers = populate_from_file(bed);
+ primers = populate_from_file(bed, primer_offset);
if(primers.size() == 0){
std::cout << "Exiting." << std::endl;
return -1;
@@ -363,6 +363,8 @@ int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out,
if(!pair_info.empty()){
amplicons = populate_amplicons(pair_info, primers);
}
+ std::cout << "Amplicons detected: " << std::endl;
+ amplicons.inOrder();
if(bam.empty()){
std::cout << "Bam file is empty." << std::endl;
return -1;
@@ -605,7 +607,10 @@ int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out,
std::cout << unmapped_counter << " unmapped reads were not written to file." << std::endl;
}
if(amplicon_flag_ctr > 0){
- std::cout << amplicon_flag_ctr << " reads were ignored because they did not fall within an amplicon" << std::endl;
+ std::cout << round_int(amplicon_flag_ctr, mapped)
+ << "% (" << amplicon_flag_ctr
+ << ") reads were ignored because they did not fall within an amplicon"
+ << std::endl;
}
if(failed_frag_size > 0){
std::cout << round_int(failed_frag_size, mapped)
=====================================
src/trim_primer_quality.h
=====================================
@@ -31,7 +31,7 @@ inline void free_cigar(cigar_ t) { if (t.free_cig) free(t.cigar); }
void add_pg_line_to_header(bam_hdr_t** hdr, char *cmd);
-int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::string region_, uint8_t min_qual, uint8_t sliding_window, std::string cmd, bool write_no_primer_reads, bool mark_qcfail_flag, int min_length, std::string pair_info);
+int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::string region_, uint8_t min_qual, uint8_t sliding_window, std::string cmd, bool write_no_primer_reads, bool mark_qcfail_flag, int min_length, std::string pair_info, int32_t primer_offset);
void free_cigar(cigar_ t);
int32_t get_pos_on_query(uint32_t *cigar, uint32_t ncigar, int32_t pos, int32_t ref_start);
int32_t get_pos_on_reference(uint32_t *cigar, uint32_t ncigar, uint32_t pos, uint32_t ref_start);
=====================================
tests/test_amplicon_search.cpp
=====================================
@@ -5,11 +5,11 @@
#include "../src/primer_bed.h"
#include "../src/interval_tree.h"
-int test_amplicon_search(std::string bam_file, std::string bed_file, std::string pair_info_file, bool expected[])
+int test_amplicon_search(std::string bam_file, std::string bed_file, std::string pair_info_file, int32_t primer_offset, bool expected[])
{
std::vector<primer> primers;
IntervalTree amplicons;
- primers = populate_from_file(bed_file);
+ primers = populate_from_file(bed_file, primer_offset);
std::cout << "Total Number of primers: " << primers.size() << std::endl;
amplicons = populate_amplicons(pair_info_file, primers);
samFile *in = hts_open(bam_file.c_str(), "r");
@@ -35,13 +35,14 @@ int main(){
int success;
std::string bam = "../data/test_amplicon.sorted.bam";
std::string pair_indice = "../data/pair_info_2.tsv";
+ int32_t primer_offset = 0;
std::string bed = "../data/test_isize.bed";
std::string prefix = "/tmp/data/trim_amplicon";
std::string bam_out = "/tmp/trim_amplicon.bam";
IntervalTree amplicons;
bool expected[8] = {true, false, true, false, true, false, true, false};
- success = test_amplicon_search(bam, bed, pair_indice, expected);
+ success = test_amplicon_search(bam, bed, pair_indice, primer_offset, expected);
amplicons.inOrder();
return success;
}
=====================================
tests/test_interval_tree.cpp
=====================================
@@ -7,7 +7,7 @@ int test_itree_overlap(IntervalTree tree, Interval queries[], int num_tests, boo
int result = 0;
for (int i = 0; i < num_tests; i++)
{
- result = tree.overlapSearch(queries[i]);
+ result = tree.envelopSearch(queries[i]);
if (result != expected[i])
{
std::cout << "Interval Tree overlap behavior incorrect for interval " << queries[i].low << ":" << queries[i].high
=====================================
tests/test_isize_trim.cpp
=====================================
@@ -21,6 +21,7 @@ int test_isize_trim(uint8_t min_qual, uint8_t sliding_window, bool no_write_flag
std::string bam = "../data/test_isize.sorted.bam";
std::string bed = "../data/test_isize.bed";
std::string pair_info = "";
+ int32_t primer_offset = 0;
std::string prefix = "../data/trim_isize";
std::string bam_out = "../data/trim_isize.bam";
@@ -29,7 +30,7 @@ int test_isize_trim(uint8_t min_qual, uint8_t sliding_window, bool no_write_flag
std::string cmd = "@PG\tID:ivar-trim\tPN:ivar\tVN:1.0.0\tCL:ivar trim\n";
// Test and check result
- res = trim_bam_qual_primer(bam, bed, prefix, region_, min_qual, sliding_window, cmd, no_write_flag, keep_for_reanalysis, min_length, pair_info);
+ res = trim_bam_qual_primer(bam, bed, prefix, region_, min_qual, sliding_window, cmd, no_write_flag, keep_for_reanalysis, min_length, pair_info, primer_offset);
if (res) {
success = -1;
=====================================
tests/test_primer_trim_edge_cases.cpp
=====================================
@@ -4,7 +4,8 @@
int main(){
int success = 0;
std::string bam = "../data/primer_only/primer_edge_cases.bam";
- std::vector<primer> primers = populate_from_file("../data/test.bed");
+ int32_t primer_offset = 0;
+ std::vector<primer> primers = populate_from_file("../data/test.bed", primer_offset);
int max_primer_len = 0;
max_primer_len = get_bigger_primer(primers);
std::string region_;
=====================================
tests/test_trim.cpp
=====================================
@@ -21,6 +21,7 @@ int test_trim(uint8_t min_qual, uint8_t sliding_window, bool no_write_flag, bool
std::string bam = "../data/test.unmapped.sorted.bam";
std::string bed = "../data/test.bed";
std::string pair_info = "";
+ int32_t primer_offset = 0;
std::string prefix = "/tmp/trim";
std::string bam_out = "/tmp/trim.bam";
@@ -28,7 +29,7 @@ int test_trim(uint8_t min_qual, uint8_t sliding_window, bool no_write_flag, bool
std::string cmd = "@PG\tID:ivar-trim\tPN:ivar\tVN:1.0.0\tCL:ivar trim\n";
// Test and check result
- res = trim_bam_qual_primer(bam, bed, prefix, region_, min_qual, sliding_window, cmd, no_write_flag, keep_for_reanalysis, min_length, pair_info);
+ res = trim_bam_qual_primer(bam, bed, prefix, region_, min_qual, sliding_window, cmd, no_write_flag, keep_for_reanalysis, min_length, pair_info, primer_offset);
if (res) {
success = -1;
std::cerr << testname << " failed: trim_bam_qual_primer() returned " << res << std::endl;
View it on GitLab: https://salsa.debian.org/med-team/ivar/-/commit/b02028a7de0247141fb9648bcd2e4e048363a38c
--
View it on GitLab: https://salsa.debian.org/med-team/ivar/-/commit/b02028a7de0247141fb9648bcd2e4e048363a38c
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210827/8a367f27/attachment-0001.htm>
More information about the debian-med-commit
mailing list