[med-svn] [Git][med-team/vcftools][upstream] New upstream version 0.1.17

Dylan Aïssi (@daissi) gitlab at salsa.debian.org
Fri Aug 22 07:39:13 BST 2025



Dylan Aïssi pushed to branch upstream at Debian Med / vcftools


Commits:
1e72a2dd by Dylan Aïssi at 2025-08-22T08:09:35+02:00
New upstream version 0.1.17
- - - - -


15 changed files:

- .tarball-version
- README.md
- configure.ac
- src/cpp/bcf_entry.cpp
- src/cpp/bcf_entry_setters.cpp
- src/cpp/bcf_file.cpp
- src/cpp/entry.cpp
- src/cpp/entry_filters.cpp
- src/cpp/header.cpp
- src/cpp/parameters.h
- src/cpp/variant_file_output.cpp
- src/cpp/vcf_entry.cpp
- src/cpp/vcf_file.cpp
- src/cpp/vcftools.1
- src/perl/Vcf.pm


Changes:

=====================================
.tarball-version
=====================================
@@ -1 +1 @@
-0.1.16
+0.1.17


=====================================
README.md
=====================================
@@ -65,9 +65,7 @@ man vcftools
 Getting Help
 ------------
 
-The best way to get help regarding VCFtools is to email the mailing list:
-
-vcftools-help at lists.sourceforge.net
+The best way to get help regarding VCFtools is to use the GitHub Issues page.
 
 Citation
 --------


=====================================
configure.ac
=====================================
@@ -63,9 +63,7 @@ AC_SYS_LARGEFILE
 
 # Checks for library functions.
 AC_FUNC_ERROR_AT_LINE
-AC_FUNC_MALLOC
-AC_FUNC_REALLOC
-AC_CHECK_FUNCS([gethostbyaddr gethostbyname memset pow select socket sqrt strchr strdup strerror strstr strtol])
+AC_CHECK_FUNCS([gethostbyaddr gethostbyname malloc memset pow realloc select socket sqrt strchr strdup strerror strstr strtol])
 
 # Optional features.
 AC_ARG_ENABLE([pca],


=====================================
src/cpp/bcf_entry.cpp
=====================================
@@ -244,7 +244,7 @@ void bcf_entry::parse_genotype_entry(unsigned int indv, bool GT, bool GQ, bool D
 		else if ((int)ui == GQ_idx)
 		{
 			if (size>1)
-				LOG.error("Error: Only expect single value for QUALITY.\n");
+				LOG.error("Only expect single value for QUALITY.\n");
 
 			float tmp;
 			if (type==5)
@@ -265,14 +265,14 @@ void bcf_entry::parse_genotype_entry(unsigned int indv, bool GT, bool GQ, bool D
 				tmp = (float)tmp2;
 			}
 			else
-				LOG.error("Error: Invalid type for QUALITY.\n");
+				LOG.error("Invalid type for QUALITY.\n");
 
 			set_indv_GQUALITY(indv, tmp);
 		}
 		else if ((int)ui == DP_idx)
 		{
 			if (size>1)
-				LOG.error("Error: Only expect single value for DEPTH.\n");
+				LOG.error("Only expect single value for DEPTH.\n");
 
 			int tmp = -1;
 
@@ -299,7 +299,7 @@ void bcf_entry::parse_genotype_entry(unsigned int indv, bool GT, bool GQ, bool D
 				tmp = (int)tmp2;
 			}
 			else
-				LOG.error("Error: Invalid type for DEPTH.\n");
+				LOG.error("Invalid type for DEPTH.\n");
 
 			set_indv_DEPTH(indv, tmp);
 		}
@@ -657,7 +657,7 @@ void bcf_entry::print_bcf(BGZF* out, const set<string> &INFO_to_keep, bool keep_
 			else if (map_type == Flag)
 				make_typed_int(tmp_vector, 1, true );
 			else
-				LOG.error("Invalid type in INFO definition", 0);
+				LOG.error("Invalid type in INFO definition");
 
 			out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end());
 		}


=====================================
src/cpp/bcf_entry_setters.cpp
=====================================
@@ -10,6 +10,8 @@
 
 void bcf_entry::set_ALT(const int n_allele)
 {
+	if (n_allele <= 0)
+		LOG.error("Number of alleles must be positive.");
 	ALT.resize(n_allele-1);
 	unsigned int pos = ALT_pos;
 	string allele;


=====================================
src/cpp/bcf_file.cpp
=====================================
@@ -64,7 +64,7 @@ void bcf_file::open()
 	{
 		file_tmp.open(filename.c_str(), ios::in);
 		if (!file_tmp.is_open())
-			LOG.error("Could not open VCF file: " + filename, 0);
+			LOG.error("Could not open VCF file: " + filename);
 		file_in = &file_tmp;
 	}
 }
@@ -80,7 +80,7 @@ void bcf_file::open_gz()
 		gzfile_in = gzopen(filename.c_str(), "rb");
 
 	if (gzfile_in == NULL)
-		LOG.error("Could not open BGZF BCF file: " + filename, 0);
+		LOG.error("Could not open BGZF BCF file: " + filename);
 	#ifdef ZLIB_VERNUM
 		string tmp(ZLIB_VERSION);
 		LOG.printLOG("Using zlib version: " + tmp + "\n");
@@ -117,7 +117,7 @@ void bcf_file::get_entry(vector<char> &out)
 	ret = read(&size_int[0], 2, sizeof(uint32_t) );
 	read_size = size_int[0] + size_int[1];
 
-	if (ret)
+	if (ret && (read_size > 0))
 	{
 		out.resize(read_size+2*sizeof(uint32_t));
 		memcpy(&out[0], size_int, 2*sizeof(uint32_t));


=====================================
src/cpp/entry.cpp
=====================================
@@ -733,10 +733,7 @@ int entry::get_typed_int(unsigned int * line_position, const vector<char>& line,
 	get_type( line_position, line, type, size );
 
 	if (size > 1)
-	{
-		LOG.printLOG("Error: Int vector when expected only a single Integer value.\n" );
-		exit(0);
-	}
+		LOG.error("Int vector when expected only a single Integer value.\n" );
 
 	if (type == 1)
 	{
@@ -760,10 +757,8 @@ int entry::get_typed_int(unsigned int * line_position, const vector<char>& line,
 		out = tmp;
 	}
 	else
-	{
-		LOG.printLOG("Error: Invalid type for integer size.\n");
-		exit(0);
-	}
+		LOG.error("Invalid type for integer size.\n");
+	
 	return out;
 }
 
@@ -808,10 +803,8 @@ vector<int> entry::get_int_vector(unsigned int * line_position, const vector<cha
 		}
 	}
 	else
-	{
-		LOG.printLOG("Error: Invalid type for integer size.\n");
-		exit(0);
-	}
+		LOG.error("Invalid type for integer size.\n");
+
 	return out;
 }
 
@@ -852,10 +845,7 @@ void entry::get_type(unsigned int * line_position, const vector<char>& line, uns
 			size = (unsigned int)tmp;
 		}
 		else
-		{
-			LOG.printLOG("Error: Invalid type for integer size.\n");
-			exit(0);
-		}
+			LOG.error("Invalid type for integer size.\n");
 	}
 }
 


=====================================
src/cpp/entry_filters.cpp
=====================================
@@ -139,7 +139,7 @@ void entry::filter_sites_to_keep(const set<string> &snps_to_keep, const string &
 
 		if (!in.is_open())
 		{
-			LOG.error("Could not open SNPs to Keep file" + snps_to_keep_file, 0);
+			LOG.error("Could not open SNPs to Keep file" + snps_to_keep_file);
 		}
 		while (!in.eof())
 		{
@@ -171,7 +171,7 @@ void entry::filter_sites_to_exclude(const string &snps_to_exclude_file)
 		string tmp;
 		if (!in.is_open())
 		{
-			LOG.error("Could not open SNPs to Exclude file" + snps_to_exclude_file, 0);
+			LOG.error("Could not open SNPs to Exclude file" + snps_to_exclude_file);
 		}
 		while (!in.eof())
 		{


=====================================
src/cpp/header.cpp
=====================================
@@ -20,6 +20,8 @@ void header::parse_meta(const string &line, unsigned int &line_index)
 	lines.push_back(line);
 	if (line.compare(0,13,"##fileformat=")==0)
 	{
+	        if (line.size() < 13)
+        	    LOG.error("Malformed header line: " + line);
 		has_file_format = true;
 		string version = line.substr(13);
 		if ((version != "VCFv4.0") && (version != "VCFv4.1") && (version != "VCFv4.2"))
@@ -27,18 +29,26 @@ void header::parse_meta(const string &line, unsigned int &line_index)
 	}
 	else if (line.compare(0,7,"##INFO=")==0)
 	{	// Found an INFO descriptor
+	        if (line.size() < 8)
+        	    LOG.error("Malformed header line: " + line);
 		line_index += add_INFO_descriptor(line.substr(8, line.size()-8), line_index);
 	}
 	else if (line.compare(0,9,"##FILTER=")==0)
 	{	// Found a FILTER descriptor
+		if (line.size() < 10)
+        	    LOG.error("Malformed header line: " + line);
 		line_index += add_FILTER_descriptor(line.substr(10, line.size()-8), line_index);
 	}
 	else if (line.compare(0,9,"##FORMAT=")==0)
 	{	// Found a genotype filter descriptor
+		if (line.size() < 10)
+        	    LOG.error("Malformed header line: " + line);
 		line_index += add_FORMAT_descriptor(line.substr(10, line.size()-8), line_index);
 	}
 	else if (line.compare(0,9,"##contig=")==0)
 	{	// Found a contig descriptor
+		if (line.size() < 10)
+        	    LOG.error("Malformed header line: " + line);
 		add_CONTIG_descriptor(line.substr(10, line.size()-8), contig_index);
 		contig_index++;
 		has_contigs = true;
@@ -356,7 +366,7 @@ int header::add_FILTER_descriptor(const string &in, int index)
 		tokenize(tokens[ui], '=', entry);
 		if (entry.size() < 2)
 		{
-			LOG.warning("Warning: Expected at least 2 parts in FORMAT entry: " + in);
+			LOG.warning("Warning: Expected at least 2 parts in FILTER entry: " + in);
 			continue;
 		}
 		if (entry[0] == "ID") I.ID = entry[1];


=====================================
src/cpp/parameters.h
=====================================
@@ -193,7 +193,7 @@ public:
 
 private:
 	void check_parameters();
-	static void error(string err_msg, int code);
+	static void error(string err_msg, int code=1);
 
 	vector<string> argv;
 


=====================================
src/cpp/variant_file_output.cpp
=====================================
@@ -2291,7 +2291,7 @@ void variant_file::output_haplotype_r2_of_SNP_list_vs_all_others(const parameter
 	BED.close();
 
 	if (nlist == 0)
-		LOG.error("No sites found in positions file.\n",0);
+		LOG.error("No sites found in positions file.\n");
 
 	LOG.printLOG("\tRead in "+header::int2str(nlist)+" site(s) for LD analysis.\n");
 
@@ -2533,7 +2533,7 @@ void variant_file::output_genotype_r2_of_SNP_list_vs_all_others(const parameters
 	BED.close();
 
 	if (nlist == 0)
-		LOG.error("No sites found in positions file.\n",0);
+		LOG.error("No sites found in positions file.\n");
 
 	LOG.printLOG("\tRead in "+header::int2str(nlist)+" site(s) for LD analysis.\n");
 
@@ -4079,6 +4079,21 @@ void variant_file::output_windowed_nucleotide_diversity(const parameters &params
 		LOG.error("Require Genotypes in VCF file in order to output Nucleotide Diversity Statistics.");
 
 	LOG.printLOG("Outputting Windowed Nucleotide Diversity Statistics...\n");
+
+	ifstream mask;
+	bool use_mask = false;
+	if (params.mask_file != "")
+	{
+		LOG.printLOG("Using mask in Windowed Nucleotide Diversity calculation...\n");
+		mask.open(params.mask_file.c_str());
+		if (!mask.is_open())
+			LOG.error("Could not open mask file: " + params.mask_file);
+		use_mask = true;
+		mask.close();
+	} else {
+		LOG.warning("Calculating Windowed Nucleotide Diversity without a mask can give misleading results. It is recommended you use a mask to define well characterized sites in the genome.");
+	}
+
 	string output_file = params.output_prefix + ".windowed.pi";
 
 	string CHROM;
@@ -4176,24 +4191,78 @@ void variant_file::output_windowed_nucleotide_diversity(const parameters &params
 		buf = cout.rdbuf();
 
 	ostream out(buf);
-	out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tPI" << endl;
+	out << "CHROM\tBIN_START\tBIN_END\tN_VARIANTS\tN_MONOMORPHIC\tPI" << endl;
 
 	unsigned long N_monomorphic_sites = 0;
 	int N_kept_chr = 2*N_kept_individuals();
 	N_comparisons = (N_kept_chr * (N_kept_chr - 1)); 	// Number of pairwise comparisons at a monomorphic site
 	unsigned long N_pairs = 0; 								// Number of pairwise comparisons within a window
 	double pi = 0;
+	vector<char> mask_data; // Used to store mask data if needed.
 
 	for (unsigned int ui=0; ui<chrs.size(); ui++)
 	{
 		CHROM = chrs[ui];
+
+		if (use_mask == true)
+		{
+			LOG.printLOG("Reading mask for chromosome: " + CHROM + "\n");
+	                mask.open(params.mask_file.c_str());
+        	        if (!mask.is_open())
+                        LOG.error("Could not open mask file: " + params.mask_file);
+			string mask_chr = "";
+			string line;
+			while (!mask.eof())
+			{
+				getline(mask, line);
+				line.erase( line.find_last_not_of(" \t") + 1);
+
+				if (line[0] == '>')
+				{
+					mask_chr = line.substr(1, line.find_first_of(" \t")-1);
+					if (mask_chr == CHROM)
+						break;
+				}
+			}
+			mask_data.resize(0);
+			while (!mask.eof())
+			{
+				getline(mask, line);
+				line.erase( line.find_last_not_of(" \t") + 1);
+
+				if (line[0] == '>')
+					break;
+				vector<short> digits(line.begin(), line.end());
+				for (unsigned int uj=0; uj < digits.size(); uj++)
+					digits[uj] -= '0';
+				mask_data.insert(mask_data.end(), digits.begin(), digits.end());
+			}
+			mask.close();
+		}
+
 		for (unsigned int s=0; s<bins[CHROM].size(); s++)
 		{
 			if( (bins[CHROM][s][N_polymorphic_sites] > 0) || (bins[CHROM][s][N_mismatches] > 0) )
 			{
-				// This number can be slightly off for the last bin since the
-				// window size can go off the end of the chromosome.
-				N_monomorphic_sites = window_size - bins[CHROM][s][N_variant_sites];
+				N_monomorphic_sites = 0;
+				if (use_mask == false)
+				{
+					// This number can be slightly off for the last bin since the
+					// window size can go off the end of the chromosome.
+					N_monomorphic_sites = window_size - bins[CHROM][s][N_variant_sites];
+				} else {
+					bool keep;
+					unsigned long N_unmasked_sites = 0;
+					for (unsigned int uj=(s*window_step); uj < min((unsigned int)((s*window_step) + window_size), (unsigned int)mask_data.size()); uj++)
+					{
+						keep = mask_data[uj] <= params.min_kept_mask_value;
+						if (params.invert_mask == true)
+							keep = !keep;
+						if (keep == true)
+							N_unmasked_sites++;
+					}
+					N_monomorphic_sites = N_unmasked_sites - bins[CHROM][s][N_variant_sites];
+				}
 
 				// The total number of possible pairwise comparisons is the sum of
 				// pairwise comparisons at polymorphic sites and pairwise
@@ -4205,7 +4274,8 @@ void variant_file::output_windowed_nucleotide_diversity(const parameters &params
 				    << s*window_step + 1 << "\t"
 				    << (s*window_step + window_size) << "\t"
 				    << bins[CHROM][s][N_polymorphic_sites] << "\t"
-				    << pi << endl;
+				    << N_monomorphic_sites << "\t"
+			        << pi << endl;
 			}
 		}
 	}


=====================================
src/cpp/vcf_entry.cpp
=====================================
@@ -274,7 +274,7 @@ void vcf_entry::parse_FORMAT()
 		else if ( (type == Character) or (type == String) )
 			make_typed_string_vector(tmp_vector, tmp_split, number );
 		else
-			LOG.error("Invalid type in FORMAT definition", 0);
+			LOG.error("Invalid type in FORMAT definition");
 
 		position = 0;
 		get_type(&position, tmp_vector, type, size);
@@ -388,10 +388,10 @@ void vcf_entry::print_bcf(BGZF* out, const set<string> &INFO_to_keep, bool keep_
 
 	tmp_string = get_CHROM();
 	if (tmp_string == "." or tmp_string == " " or tmp_string == "")
-		LOG.error("CHROM value must be defined for all entries.",0);
+		LOG.error("CHROM value must be defined for all entries.");
 
 	if (entry_header.CONTIG_reverse_map.find(tmp_string) == entry_header.CONTIG_reverse_map.end() )
-		LOG.error("CHROM value " + tmp_string + " is not defined on contig dictionary.",0);
+		LOG.error("CHROM value " + tmp_string + " is not defined on contig dictionary.");
 
 	int32_t chrom = (int32_t)entry_header.CONTIG_reverse_map[tmp_string];
 	memcpy(&out_vector[vector_pos], &chrom, sizeof(chrom));
@@ -471,7 +471,7 @@ void vcf_entry::print_bcf(BGZF* out, const set<string> &INFO_to_keep, bool keep_
 		else if (map_type == Flag)
 			make_typed_int(tmp_vector, 1, true );
 		else
-			LOG.error("Invalid type in INFO definition", 0);
+			LOG.error("Invalid type in INFO definition");
 
 		out_vector.insert(out_vector.end(), tmp_vector.begin(), tmp_vector.end());
 	}


=====================================
src/cpp/vcf_file.cpp
=====================================
@@ -57,7 +57,7 @@ void vcf_file::read_header()
 	while (!eof())
 	{
 		read_line(line);
-		if (line[0] == '#')
+		if ((line.size() > 2) && (line[0] == '#'))
 			if (line[1] == '#')
 				meta_data.parse_meta(line, line_index);
 			else
@@ -210,10 +210,10 @@ void vcf_file::open()
 	if (i != 0)
 	{
 		perror("stat error");
-		LOG.error("Can't determine file type of " + filename, 0);
+		LOG.error("Can't determine file type of " + filename);
 	}
 	if (!S_ISREG(buf.st_mode))
-		LOG.error("Does not appear to be a regular file: " + filename, 0);
+		LOG.error("Does not appear to be a regular file: " + filename);
 
 	if (filename.substr(filename.size()-4) == ".bcf")
 		LOG.error("Filename ends in '.bcf'. Shouldn't you be using --bcf?\n");
@@ -224,7 +224,7 @@ void vcf_file::open()
 			LOG.error("Filename ends in '.gz'. Shouldn't you be using --gzvcf or --gzdiff?\n");
 		file_tmp.open(filename.c_str(), ios::in);
 		if (!file_tmp.is_open())
-			LOG.error("Could not open VCF file: " + filename, 0);
+			LOG.error("Could not open VCF file: " + filename);
 
 		file_in = &file_tmp;
 	}
@@ -243,7 +243,7 @@ void vcf_file::open_gz()
 		gzfile_in = gzopen(filename.c_str(), "rb");
 
 	if (gzfile_in == NULL)
-		LOG.error("Could not open GZVCF file: " + filename, 0);
+		LOG.error("Could not open GZVCF file: " + filename);
 	#ifdef ZLIB_VERNUM
 		string tmp(ZLIB_VERSION);
 		LOG.printLOG("Using zlib version: " + tmp + "\n");


=====================================
src/cpp/vcftools.1
=====================================
@@ -1,7 +1,7 @@
 .\" Manpage for vcftools.
-.TH vcftools man page 1 "2 August 2018" "0.1.16" "vcftools man page"
+.TH vcftools man page 1 "2 August 2018" "0.1.17" "vcftools man page"
 .SH NAME
-vcftools v0.1.16 \- Utilities for the variant call format (VCF) and binary variant call format (BCF)
+vcftools v0.1.17 \- Utilities for the variant call format (VCF) and binary variant call format (BCF)
 .SH SYNOPSIS
 .B vcftools
 [
@@ -169,7 +169,7 @@ An example mask file would look like:
 .br
 .I 2222211111000...
 .RE
-In this example, sites in the VCF file located within the first 5 bases of the start of chromosome 1 would be kept, whereas sites at position 6 onwards would be filtered out. And sites after the 11th position on chromosome 2 would be filtered out as well.
+In this example, sites in the VCF file located within the first 5 bases of the start of chromosome 1 would be kept, whereas sites at position 6 onwards would be filtered out. And sites before the 11th position on chromosome 2 would be filtered out as well.
 .br
 The "--invert-mask" option takes the same format mask file as the "--mask" option, however it inverts the mask file before filtering with it.
 .br
@@ -529,6 +529,8 @@ Measures nucleotide divergency on a per-site basis. The output file has the suff
 .I <integer>
 .RS 2
 Measures the nucleotide diversity in windows, with the number provided as the window size. The output file has the suffix ".windowed.pi". The latter is an optional argument used to specify the step size in between windows.
+.br
+Note: vcftools can make use of a mask (defined using the --mask parameter) to define which sites have been well characterized for the estimation of nucleotide diversity. Using a mask to define the well-characterized portion of the genome is recommended when estimating nucleotide diversity, as (for example) genetic variants may be poorly characterized in low-coverage or poorly sequenced regions of the genome.
 .RE
 .SS OUTPUT FST STATISTICS
 .RS 2


=====================================
src/perl/Vcf.pm
=====================================
@@ -201,7 +201,7 @@ sub _open
         $tabix_args .= qq['$$self{file}'];
         if ( exists($args{region}) && defined($args{region}) ) { $tabix_args .= qq[ '$args{region}']; }
 
-        if ( -e $$self{file} && $$self{file}=~/\.gz/i )
+        if ( -e $$self{file} && $$self{file}=~/\.gz$/i )
         {
             if ( exists($args{region}) && defined($args{region}) )
             {



View it on GitLab: https://salsa.debian.org/med-team/vcftools/-/commit/1e72a2dd49d729525a3db8868277a8bbf5657136

-- 
View it on GitLab: https://salsa.debian.org/med-team/vcftools/-/commit/1e72a2dd49d729525a3db8868277a8bbf5657136
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20250822/7021490d/attachment-0001.htm>


More information about the debian-med-commit mailing list