[med-svn] [Git][med-team/discosnp][upstream] New upstream version 2.4.3
Andreas Tille
gitlab at salsa.debian.org
Mon Jan 27 14:44:58 GMT 2020
Andreas Tille pushed to branch upstream at Debian Med / discosnp
Commits:
5d31fa29 by Andreas Tille at 2020-01-27T14:14:14+01:00
New upstream version 2.4.3
- - - - -
4 changed files:
- CMakeLists.txt
- discoSnpRAD/README.md
- discoSnpRAD/clustering_scripts/discoRAD_clustering.sh
- discoSnpRAD/run_discoSnpRad.sh
Changes:
=====================================
CMakeLists.txt
=====================================
@@ -10,7 +10,7 @@ cmake_minimum_required(VERSION 2.6)
################################################################################
SET (gatb-tool_VERSION_MAJOR 2)
SET (gatb-tool_VERSION_MINOR 4)
-SET (gatb-tool_VERSION_PATCH 2)
+SET (gatb-tool_VERSION_PATCH 3)
IF (DEFINED MAJOR)
SET (gatb-tool_VERSION_MAJOR ${MAJOR})
@@ -84,6 +84,7 @@ SET (CPACK_SOURCE_IGNORE_FILES
)
# For creating the BINARY package we include the files we want
+INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/discoSnpRAD DESTINATION .)
INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doc DESTINATION .)
INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test DESTINATION .)
INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts DESTINATION . FILES_MATCHING REGEX ".*\\.(py|sh)$" PATTERN "jenkins" EXCLUDE)
=====================================
discoSnpRAD/README.md
=====================================
@@ -11,22 +11,25 @@ Gauthier, J., Mouden, C., Suchan, T., Alvarez, N., Arrigo, N., Riou, C., Lemait
## Installation
-* discoSnp++
+* discoSnp++ (see [../README.md](../README.md))
* `short_read_connector` must have been downloaded and installed (clustering task). [https://github.com/GATB/short_read_connector](https://github.com/GATB/short_read_connector)
## Usage
```
-./run_discoSnpRad.sh --fof read_file_of_files --src_path <directory> [discoSnp++ OPTIONS]
+./run_discoSnpRad.sh -r read_file_of_files -S -p myDiscoSnpRADresult [discoSnp++ OPTIONS]
```
-Clustering
+**Clustering option** (RAD-specific option):
+
```
--S|--src_path <directory>
- **absolute** path to short_read_connector directory, containing the "short_read_connector.sh" file.
+-S|--src [src_path]
+ performs clustering of variants with short_read_connector
+ src_path: **absolute** path to short_read_connector directory, containing the "short_read_connector.sh" file.
-Note1: short read connector must be compiled.
- -Note2: with this option, discoSnpRad provide a vcf file containing SNPs and INDELS, clustered by locus
+ -Note2: if no value is given, it assumes short_read_connector.sh is in the PATH env variable.
+ -Note3: with this option, discoSnpRad outputs a vcf file containing the variants clustered by locus.
```
All other options are described in [discoSnp++ README](../README.md). Note that many discoSNP++ parameters have here default values, specifically adapted to RAD-seq data.
@@ -39,8 +42,33 @@ To see all options:
## Output
-* a log file reminds all filtering steps applied and the name of the output .vcf file
-* a vcf file containing results of filtering and clustering
+When run with output prefix name `myDiscoSnpRADresult`, the main output file is:
+
+* `myDiscoSnpRADresult_[parameter_values]_clustered.vcf`: the final set of variants, with various information, including clustering per locus information (see VCF format below).
+* or `myDiscoSnpRADresult_[parameter_values].vcf` if no clustering was performed.
+
+Additionally, several other potentially useful files are output:
+
+* `myDiscoSnpRADresult_[parameter_values]_raw.fa`: the raw set of variants in fasta format, prior to any filtering and clustering steps.
+* `myDiscoSnpRADresult_[graph_parameter_values].h5`: the de Bruijn graph in h5 format (reusable with any GATB tool)
+* `myDiscoSnpRADresult_read_files_correspondance.txt`: the correspondence between read file names and IDs given as genotypes in the vcf
+* the standard output recalls all filtering steps applied and the name of the output .vcf file
+
+#### VCF format
+
+Each variant is described with:
+
+* an ID: `ID` column,
+
+* two alleles (`REF` and `ALT` columns),
+
+* a quality value: `INFO` column, `Rk`, between 0 (bad) and 1 (best),
+
+* some clustering information: `INFO` column, with the locus id (`Cluster`) and its number of varying sites (`ClSize`),
+
+* and for each sample in the genotype columns (`G1`, `G2`,...): the inferred genotype (`0/0`, `0/1`, `1/1` or `./.` for missing value), the read depths (`RD` total, `AD` per allele), among others.
+
+
## Content of this directory
=====================================
discoSnpRAD/clustering_scripts/discoRAD_clustering.sh
=====================================
@@ -22,7 +22,7 @@ echo "this script manages bubble clustering from a discofile.fa file, and the in
echo " 1/ Remove variants with more than 95% missing genotypes and low rank (<0.4)"
echo " 2/ Cluster variants per locus"
echo " 3/ Format the variants in a vcf file with cluster information"
-echo "Usage: ./discoRAD_clustering.sh -f discofile -s SRC_directory/ -o output_file.vcf"
+echo "Usage: ./discoRAD_clustering.sh -f discofile -s SRC_path -o output_file.vcf"
# echo "nb: all options are MANDATORY\n"
echo "OPTIONS:"
echo "\t -f: DiscoSnp fasta output containing coherent predictions"
@@ -44,7 +44,7 @@ while getopts "f:s:o:hw" opt; do
;;
s)
- short_read_connector_directory=$OPTARG
+ short_read_connector_path=$OPTARG
;;
o)
@@ -66,7 +66,7 @@ if [[ -z "${rawdiscofile}" ]]; then
echo "${red}-f is mandatory$reset" >&2
exit
fi
-if [[ -z "${short_read_connector_directory}" ]]; then
+if [[ -z "${short_read_connector_path}" ]]; then
echo "${red}-s is mandatory$reset" >&2
exit
fi
@@ -80,7 +80,7 @@ EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
if [ -d "$EDIR/../../build/" ] ; then # VERSION SOURCE COMPILED
BINDIR=$EDIR"/../../build/bin"
else # VERSION BINARY
-BINDIR=$EDIR"/../../bin"
+BINDIR=$EDIR/../../bin
fi
rawdiscofile_base=$( basename "${rawdiscofile}" .fa)
@@ -150,7 +150,7 @@ fi
#ls ${disco_simpler}.fa > ${disco_simpler}.fof
# Compute sequence similarities
-cmdSRC="${short_read_connector_directory}/short_read_connector.sh -b ${disco_simpler}.fa -q ${disco_simpler}.fof -s 0 -k ${usedk} -a 1 -l -p ${disco_simpler} 1>&2 "
+cmdSRC="${short_read_connector_path} -b ${disco_simpler}.fa -q ${disco_simpler}.fof -s 0 -k ${usedk} -a 1 -l -p ${disco_simpler} 1>&2 "
echo $green$cmdSRC$cyan
if [[ "$wraith" == "false" ]]; then
eval $cmdSRC
=====================================
discoSnpRAD/run_discoSnpRad.sh
=====================================
@@ -72,6 +72,7 @@ output_coverage_option=""
genotyping="-genotype"
remove=1
verbose=1
+clustering="false"
short_read_connector_path=""
option_phase_variants=""
#EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
@@ -106,17 +107,19 @@ function help {
echo " ************"
echo "run_discoSnpRad.sh, pipelining kissnp2 and kissreads and clustering per locus for calling SNPs and small indels from RAD-seq data without the need of a reference genome"
echo "Version "$version
- echo "Usage: ./run_discoSnpRad.sh --fof read_file_of_files --src_path <directory> [OPTIONS]"
+ echo "Usage: ./run_discoSnpRad.sh --fof read_file_of_files --src [src_path] [OPTIONS]"
echo -e "MANDATORY"
echo -e "\t -r|--fof <file name of a file of file(s)>"
echo -e "\t\t The input read files indicated in a file of file(s)"
echo -e "\t\t Example: -r bank.fof with bank.fof containing the two lines \n\t\t\t data_sample/reads_sequence1.fasta\n\t\t\t data_sample/reads_sequence2.fasta.gz"
echo -e "\nOPTIONS"
- echo -e "\t -S|--src_path <directory>"
- echo -e "\t\t **absolute** path to short_read_connector directory, containing the \"short_read_connector.sh\" file. "
+ echo -e "\t -S|--src [src_path]"
+ echo -e "\t\t performs clustering of variants with short_read_connector"
+ echo -e "\t\t src_path: **absolute** path to short_read_connector directory, containing the \"short_read_connector.sh\" file. "
echo -e "\t\t -Note1: short read connector must be compiled."
- echo -e "\t\t -Note2: with this option, discoSnpRad provide a vcf file containing SNPs and INDELS, clustered by locus"
+ echo -e "\t\t -Note2: if no value is given, it assumes short_read_connector.sh is in the PATH env variable."
+ echo -e "\t\t -Note3: with this option, discoSnpRad outputs a vcf file containing the variants clustered by locus"
echo -e "\t -k | --k_size value <int value>"
echo -e "\t\t Set the length of used kmers. Must fit the compiled value."
@@ -184,15 +187,13 @@ while :; do
-w)
wraith="true"
;;
- -S|--src_path)
+ -S|--src)
+ clustering="true"
if [ "$2" ] && [ ${2:0:1} != "-" ] ; then # checks that there exists a second value and its is not the start of the next option
short_read_connector_path=$2
shift
- else
- die 'ERROR: "'$1'" option requires a non-empty option argument.'
fi
;;
-
-a|--ambiguity_max_size)
if [ "$2" ] && [ ${2:0:1} != "-" ] ; then # checks that there exists a second value and its is not the start of the next option
max_ambigous_indel=$2
@@ -379,27 +380,40 @@ if [ -z "$read_sets" ]; then
exit 1
fi
-src_file="$short_read_connector_path/short_read_connector.sh"
-if [[ "$wraith" == "false" ]]; then
- echo $yellow${src_file}$reset
-fi
+#Checks if clustering can be performed
-if [[ "$wraith" == "false" ]]; then
- if [ -f "$src_file" ]; then
- if [[ "$wraith" == "false" ]]; then
- echo "${yellow}short_read_connector is $src_file$reset"
- fi
- else
- if [[ "$wraith" == "false" ]]; then
- echo -e "${red}\t\t\t**************************************************************************"
+if [[ "$clustering" == "true" ]]; then
+ # first tests the directory given by user if any
+ if [ -n "$short_read_connector_path" ]; then
+ src_file="$short_read_connector_path/short_read_connector.sh"
+ if [ -f "$src_file" ]; then
+ echo "${yellow}short_read_connector path is $src_file$reset"
+ else
+ echo -e "${red}\t\t\t**************************************************************************"
echo -e "\t\t\t** WARNING: I cannot find short_read_connector (-S). "
echo -e "\t\t\t** $src_file does not exist"
echo -e "\t\t\t** I will not cluster variants per RAD locus"
echo -e "\t\t\t**************************************************************************"
echo $reset
- fi
+ clustering="false"
+ fi
+ else
+ #then tests if src is in the PATH env variable
+ src_file=$(command -v short_read_connector.sh)
+ if [ -n "$src_file" ]; then
+ echo "${yellow}short_read_connector path is $src_file$reset"
+ else
+ echo -e "${red}\t\t\t**************************************************************************"
+ echo -e "\t\t\t** WARNING: I cannot find short_read_connector in PATH. "
+ echo -e "\t\t\t** Try giving the absolute path of short_read_connector directory with option -S"
+ echo -e "\t\t\t** I will not cluster variants per RAD locus"
+ echo -e "\t\t\t**************************************************************************"
+ echo $reset
+ clustering="false"
+ fi
fi
-fi
+fi
+
######### CHECK THE k PARITY ##########
@@ -643,13 +657,13 @@ echo -e "\t######## CLUSTERING PER LOCUS AND/OR FORMATTING ###############"
echo -e "\t###############################################################$reset"
T="$(date +%s)"
-if [ -f "$src_file" ]; then
+if [[ "$clustering" == "true" ]]; then
if [[ "$wraith" == "false" ]]; then
echo "${yellow}Clustering and vcf formmatting$reset"
fi
final_output="${kissprefix}_clustered.vcf"
- cmd="$EDIR/clustering_scripts/discoRAD_clustering.sh -f ${kissprefix}_raw.fa -s $short_read_connector_path -o ${final_output}"
- echo $green$cmd$cyan
+ cmd="$EDIR/clustering_scripts/discoRAD_clustering.sh -f ${kissprefix}_raw.fa -s $src_file -o ${final_output}"
+ echo $green$cmd$cyan$reset
if [[ "$wraith" == "false" ]]; then
eval $cmd
fi
@@ -668,7 +682,7 @@ else
fi
final_output="${kissprefix}.vcf"
cmd="python3 $EDIR/../scripts/create_filtered_vcf.py -i ${kissprefix}_raw.fa -o ${final_output} -m 0.95 -r 0.4"
- echo $green$cmd$cyan
+ echo $green$cmd$cyan$reset
if [[ "$wraith" == "false" ]]; then
eval $cmd
fi
View it on GitLab: https://salsa.debian.org/med-team/discosnp/commit/5d31fa2928d45f3c3b84b2afc3b28c7120176bcd
--
View it on GitLab: https://salsa.debian.org/med-team/discosnp/commit/5d31fa2928d45f3c3b84b2afc3b28c7120176bcd
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200127/232000be/attachment-0001.html>
More information about the debian-med-commit
mailing list