[med-svn] [Git][med-team/discosnp][upstream] New upstream version 2.4.3

Mon Jan 27 14:44:58 GMT 2020


Andreas Tille pushed to branch upstream at Debian Med / discosnp


Commits:
5d31fa29 by Andreas Tille at 2020-01-27T14:14:14+01:00
New upstream version 2.4.3
- - - - -


4 changed files:

- CMakeLists.txt
- discoSnpRAD/README.md
- discoSnpRAD/clustering_scripts/discoRAD_clustering.sh
- discoSnpRAD/run_discoSnpRad.sh


Changes:

=====================================
CMakeLists.txt
=====================================
@@ -10,7 +10,7 @@ cmake_minimum_required(VERSION 2.6)
 ################################################################################
 SET (gatb-tool_VERSION_MAJOR 2)
 SET (gatb-tool_VERSION_MINOR 4)
-SET (gatb-tool_VERSION_PATCH 2)
+SET (gatb-tool_VERSION_PATCH 3)
 
 IF (DEFINED MAJOR)
     SET (gatb-tool_VERSION_MAJOR ${MAJOR})
@@ -84,6 +84,7 @@ SET (CPACK_SOURCE_IGNORE_FILES
 )
 
 # For creating the BINARY package we include the files we want
+INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/discoSnpRAD        DESTINATION .)
 INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doc                DESTINATION .)
 INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test               DESTINATION .)
 INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts            DESTINATION . FILES_MATCHING REGEX ".*\\.(py|sh)$"  PATTERN "jenkins" EXCLUDE)


=====================================
discoSnpRAD/README.md
=====================================
@@ -11,22 +11,25 @@ Gauthier, J., Mouden, C.,  Suchan, T., Alvarez, N., Arrigo, N., Riou, C., Lemait
 
 ## Installation
 
-* discoSnp++
+* discoSnp++ (see [../README.md](../README.md))
 * `short_read_connector` must have been downloaded and installed (clustering task). [https://github.com/GATB/short_read_connector](https://github.com/GATB/short_read_connector)
 
 
 ## Usage
 
 ```
-./run_discoSnpRad.sh --fof read_file_of_files --src_path <directory> [discoSnp++ OPTIONS]
+./run_discoSnpRad.sh -r read_file_of_files -S -p myDiscoSnpRADresult [discoSnp++ OPTIONS]
 ```
 
-Clustering
+**Clustering option** (RAD-specific option):
+
 ```
--S|--src_path <directory>
-    **absolute** path to short_read_connector directory, containing the "short_read_connector.sh" file. 
+-S|--src [src_path]
+    performs clustering of variants with short_read_connector
+    src_path: **absolute** path to short_read_connector directory, containing the "short_read_connector.sh" file. 
     -Note1: short read connector must be compiled.
-    -Note2: with this option, discoSnpRad provide a vcf file containing SNPs and INDELS, clustered by locus
+    -Note2: if no value is given, it assumes short_read_connector.sh is in the PATH env variable.
+    -Note3: with this option, discoSnpRad outputs a vcf file containing the variants clustered by locus.
 ```
 
 All other options are described in [discoSnp++ README](../README.md). Note that many discoSNP++ parameters have here default values, specifically adapted to RAD-seq data.
@@ -39,8 +42,33 @@ To see all options:
 
 ## Output
 
-* a log file reminds all filtering steps applied and the name of the output .vcf file
-* a vcf file containing results of filtering and clustering
+When run with output prefix name `myDiscoSnpRADresult`, the main output file is:
+
+* `myDiscoSnpRADresult_[parameter_values]_clustered.vcf`: the final set of variants, with various information, including clustering per locus information (see VCF format below).
+* or `myDiscoSnpRADresult_[parameter_values].vcf` if no clustering was performed.
+
+Additionally, several other files are output that can be useful:
+
+* `myDiscoSnpRADresult_[parameter_values]_raw.fa`: the raw set of variants in fasta format, prior to any filtering and clustering steps.
+* `myDiscoSnpRADresult_[graph_parameter_values].h5`: the de Bruijn graph in h5 format (reusable with any GATB tool)
+* `myDiscoSnpRADresult_read_files_correspondance.txt`: the correspondence between read file names and IDs given as genotypes in the vcf
+* the standard output lists all filtering steps applied and the name of the output .vcf file
+
+#### VCF format
+
+Each variant is described with: 
+
+* an ID: `ID` column, 
+
+* two alleles (`REF` and `ALT` columns), 
+
+* a quality value: `INFO` column, `Rk`, between 0 (bad) and 1 (best),
+
+* some clustering information: `INFO` field: with the locus id (`Cluster`) and its number of varying sites (`ClSize`),
+
+* and for each sample in the genotype columns (`G1`, `G2`,...): the inferred genotype (`0/0`, `0/1`, `1/1` or `./.` for missing value), the read depths (`RD` total, `AD` per allele), among others.
+
+  
 
 
 ## Content of this directory


=====================================
discoSnpRAD/clustering_scripts/discoRAD_clustering.sh
=====================================
@@ -22,7 +22,7 @@ echo "this script manages bubble clustering from a discofile.fa file, and the in
 echo " 1/ Remove variants with more than 95% missing genotypes and low rank (<0.4)"
 echo " 2/ Cluster variants per locus"
 echo " 3/ Format the variants in a vcf file with cluster information"
-echo "Usage: ./discoRAD_clustering.sh -f discofile -s SRC_directory/ -o output_file.vcf"
+echo "Usage: ./discoRAD_clustering.sh -f discofile -s SRC_path -o output_file.vcf"
 # echo "nb: all options are MANDATORY\n"
 echo "OPTIONS:"
 echo "\t -f: DiscoSnp fasta output containing coherent predictions"
@@ -44,7 +44,7 @@ while getopts "f:s:o:hw" opt; do
         ;;
 
         s)
-        short_read_connector_directory=$OPTARG
+        short_read_connector_path=$OPTARG
         ;;
 
         o)
@@ -66,7 +66,7 @@ if [[ -z "${rawdiscofile}" ]]; then
     echo "${red}-f is mandatory$reset" >&2
     exit
 fi
-if [[ -z "${short_read_connector_directory}" ]]; then
+if [[ -z "${short_read_connector_path}" ]]; then
     echo "${red}-s is mandatory$reset" >&2
     exit
 fi
@@ -80,7 +80,7 @@ EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
 if [ -d "$EDIR/../../build/" ] ; then # VERSION SOURCE COMPILED
 BINDIR=$EDIR"/../../build/bin"
 else # VERSION BINARY
-BINDIR=$EDIR"/../../bin"
+BINDIR=$EDIR/../../bin
 fi
 rawdiscofile_base=$( basename "${rawdiscofile}" .fa)
 
@@ -150,7 +150,7 @@ fi
 #ls ${disco_simpler}.fa > ${disco_simpler}.fof
 
 # Compute sequence similarities
-cmdSRC="${short_read_connector_directory}/short_read_connector.sh -b ${disco_simpler}.fa -q ${disco_simpler}.fof -s 0 -k ${usedk} -a 1 -l -p ${disco_simpler}  1>&2 "
+cmdSRC="${short_read_connector_path} -b ${disco_simpler}.fa -q ${disco_simpler}.fof -s 0 -k ${usedk} -a 1 -l -p ${disco_simpler}  1>&2 "
 echo $green$cmdSRC$cyan
 if [[ "$wraith" == "false" ]]; then
     eval $cmdSRC


=====================================
discoSnpRAD/run_discoSnpRad.sh
=====================================
@@ -72,6 +72,7 @@ output_coverage_option=""
 genotyping="-genotype"
 remove=1
 verbose=1
+clustering="false"
 short_read_connector_path=""
 option_phase_variants=""
 #EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
@@ -106,17 +107,19 @@ function help {
     echo " ************"
     echo "run_discoSnpRad.sh, pipelining kissnp2 and kissreads and clustering per locus for calling SNPs and small indels from RAD-seq data without the need of a reference genome"
     echo "Version "$version
-    echo "Usage: ./run_discoSnpRad.sh --fof read_file_of_files --src_path <directory> [OPTIONS]"
+    echo "Usage: ./run_discoSnpRad.sh --fof read_file_of_files --src [src_path] [OPTIONS]"
     echo -e "MANDATORY"
     echo -e "\t -r|--fof <file name of a file of file(s)>"
     echo -e "\t\t The input read files indicated in a file of file(s)"
     echo -e "\t\t Example: -r bank.fof with bank.fof containing the two lines \n\t\t\t data_sample/reads_sequence1.fasta\n\t\t\t data_sample/reads_sequence2.fasta.gz"
     
     echo -e "\nOPTIONS"
-    echo -e "\t -S|--src_path <directory>"
-    echo -e "\t\t **absolute** path to short_read_connector directory, containing the \"short_read_connector.sh\" file. "
+    echo -e "\t -S|--src [src_path]"
+    echo -e "\t\t performs clustering of variants with short_read_connector"
+    echo -e "\t\t src_path: **absolute** path to short_read_connector directory, containing the \"short_read_connector.sh\" file. "
     echo -e "\t\t -Note1: short read connector must be compiled."
-    echo -e "\t\t -Note2: with this option, discoSnpRad provide a vcf file containing SNPs and INDELS, clustered by locus" 
+    echo -e "\t\t -Note2: if no value is given, it assumes short_read_connector.sh is in the PATH env variable."
+    echo -e "\t\t -Note3: with this option, discoSnpRad outputs a vcf file containing the variants clustered by locus" 
 
     echo -e "\t -k | --k_size value <int value>"
     echo -e "\t\t Set the length of used kmers. Must fit the compiled value."
@@ -184,15 +187,13 @@ while :; do
     -w)
         wraith="true"
         ;;
-    -S|--src_path)
+    -S|--src)
+    	clustering="true"
         if [ "$2" ] && [ ${2:0:1} != "-" ] ; then # checks that there exists a second value and its is not the start of the next option
             short_read_connector_path=$2
             shift
-        else
-            die 'ERROR: "'$1'" option requires a non-empty option argument.'
         fi
         ;;
-
     -a|--ambiguity_max_size)
         if [ "$2" ] && [ ${2:0:1} != "-" ] ; then # checks that there exists a second value and its is not the start of the next option
             max_ambigous_indel=$2
@@ -379,27 +380,40 @@ if [ -z "$read_sets" ]; then
     exit 1
 fi
 
-src_file="$short_read_connector_path/short_read_connector.sh"
-if [[ "$wraith" == "false" ]]; then
-    echo $yellow${src_file}$reset
-fi
+#Checks if clustering can be performed
 
-if [[ "$wraith" == "false" ]]; then
-    if [ -f "$src_file" ]; then
-        if [[ "$wraith" == "false" ]]; then
-            echo "${yellow}short_read_connector is $src_file$reset"
-        fi
-    else
-        if [[ "$wraith" == "false" ]]; then
-            echo -e "${red}\t\t\t**************************************************************************"
+if [[ "$clustering" == "true" ]]; then
+	# first tests the directory given by user if any
+	if [ -n "$short_read_connector_path" ]; then
+		src_file="$short_read_connector_path/short_read_connector.sh"
+    	if [ -f "$src_file" ]; then
+            echo "${yellow}short_read_connector path is $src_file$reset"
+    	else
+    		echo -e "${red}\t\t\t**************************************************************************"
             echo -e "\t\t\t** WARNING: I cannot find short_read_connector (-S). "
             echo -e "\t\t\t** $src_file does not exist"
             echo -e "\t\t\t** I will not cluster variants per RAD locus"
             echo -e "\t\t\t**************************************************************************"
             echo $reset
-        fi
+    		clustering="false"
+    	fi
+    else
+    	#then tests if src is in the PATH env variable
+    	src_file=$(command -v short_read_connector.sh)
+    	if [ -n "$src_file" ]; then
+    		echo "${yellow}short_read_connector path is $src_file$reset"
+    	else
+    		echo -e "${red}\t\t\t**************************************************************************"
+            echo -e "\t\t\t** WARNING: I cannot find short_read_connector in PATH. "
+            echo -e "\t\t\t** Try giving the absolute path of short_read_connector directory with option -S"
+            echo -e "\t\t\t** I will not cluster variants per RAD locus"
+            echo -e "\t\t\t**************************************************************************"
+            echo $reset
+    		clustering="false"
+    	fi
     fi
-fi 
+fi
+    		
 
 
 ######### CHECK THE k PARITY ##########
@@ -643,13 +657,13 @@ echo -e "\t######## CLUSTERING PER LOCUS AND/OR FORMATTING ###############"
 echo -e "\t###############################################################$reset"
 
 T="$(date +%s)"
-if [ -f "$src_file" ]; then
+if [[ "$clustering" == "true" ]]; then
     if [[ "$wraith" == "false" ]]; then
         echo "${yellow}Clustering and vcf formmatting$reset"
     fi
     final_output="${kissprefix}_clustered.vcf"
-    cmd="$EDIR/clustering_scripts/discoRAD_clustering.sh -f ${kissprefix}_raw.fa -s $short_read_connector_path -o ${final_output}"
-    echo $green$cmd$cyan
+    cmd="$EDIR/clustering_scripts/discoRAD_clustering.sh -f ${kissprefix}_raw.fa -s $src_file -o ${final_output}"
+    echo $green$cmd$cyan$reset
     if [[ "$wraith" == "false" ]]; then
         eval $cmd
     fi  
@@ -668,7 +682,7 @@ else
     fi
     final_output="${kissprefix}.vcf"
     cmd="python3 $EDIR/../scripts/create_filtered_vcf.py -i ${kissprefix}_raw.fa -o ${final_output} -m 0.95 -r 0.4"
-    echo $green$cmd$cyan
+    echo $green$cmd$cyan$reset
     if [[ "$wraith" == "false" ]]; then
         eval $cmd
     fi



View it on GitLab: https://salsa.debian.org/med-team/discosnp/commit/5d31fa2928d45f3c3b84b2afc3b28c7120176bcd

-- 
View it on GitLab: https://salsa.debian.org/med-team/discosnp/commit/5d31fa2928d45f3c3b84b2afc3b28c7120176bcd
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200127/232000be/attachment-0001.html>