[med-svn] [sga] 02/04: Add some further test script

Andreas Tille tille at debian.org
Wed Sep 9 09:11:05 UTC 2015


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch debian
in repository sga.

commit 40a06c7f073f2eb74f13bf95ab8a92e22e16a04f
Author: Andreas Tille <tille at debian.org>
Date:   Thu Jul 9 14:40:17 2015 +0200

    Add some further test script
---
 debian/test/sga-ecoli-miseq.sh | 125 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/debian/test/sga-ecoli-miseq.sh b/debian/test/sga-ecoli-miseq.sh
new file mode 100755
index 0000000..96a4e63
--- /dev/null
+++ b/debian/test/sga-ecoli-miseq.sh
@@ -0,0 +1,125 @@
+#! /bin/bash -x
+
+#
+# Example assembly of 150bp E. coli reads
+#
+
+# Download the E. coli FASTQ files from Illumina's website
+#wget ftp://webdata:webdata@ussd-ftp.illumina.com/Data/SequencingRuns/MG1655/MiSeq_Ecoli_MG1655_110721_PF_R1.fastq.gz
+#wget ftp://webdata:webdata@ussd-ftp.illumina.com/Data/SequencingRuns/MG1655/MiSeq_Ecoli_MG1655_110721_PF_R2.fastq.gz
+
+IN1=MiSeq_Ecoli_MG1655_110721_PF_R1.fastq.gz
+IN2=MiSeq_Ecoli_MG1655_110721_PF_R2.fastq.gz
+
+#
+# Parameters
+#
+
+# Program paths
+SGA_BIN=sga
+BWA_BIN=bwa
+SAMTOOLS_BIN=samtools
+BAM2DE_BIN=sga-bam2de
+ASTAT_BIN=sga-astat
+DISTANCE_EST=DistanceEst
+
+# The number of threads to use
+CPU=8
+
+# The k-mer length to use for error correction
+CORRECTION_K=41
+
+# The minimum overlap to use when computing the graph.
+# The final assembly can be performed with this overlap or greater
+MIN_OVERLAP=85
+
+# The overlap value to use for the final assembly
+ASSEMBLE_OVERLAP=111
+
+# Branch trim length
+TRIM_LENGTH=400
+
+# The minimum length of contigs to include in a scaffold
+MIN_CONTIG_LENGTH=200
+
+# The minimum number of read pairs required to link two contigs
+MIN_PAIRS=10
+
+#
+# Dependency checks
+#
+
+# Check the required programs are installed and executable
+prog_list="$SGA_BIN $BWA_BIN $SAMTOOLS_BIN $BAM2DE_BIN $DISTANCE_EST $ASTAT_BIN"
+for prog in $prog_list; do
+    hash $prog 2>/dev/null || { echo "Error $prog not found. Please place $prog on your PATH or update the *_BIN variables in this script"; exit 1; }
+done 
+
+# Check that the input files exist
+file_list="$IN1 $IN2"
+for input in $file_list; do
+    if [ ! -f $input ]; then
+        echo "Error input file $input not found"; exit 1;
+    fi
+done
+
+#
+# Preprocessing
+#
+
+# Preprocess the data to remove ambiguous basecalls
+$SGA_BIN preprocess --pe-mode 1 -o reads.pp.fastq $IN1 $IN2
+
+#
+# Error Correction
+#
+
+# Build the index that will be used for error correction
+# As the error corrector does not require the reverse BWT, suppress
+# construction of the reversed index
+$SGA_BIN index -a ropebwt -t $CPU --no-reverse reads.pp.fastq
+
+# Perform k-mer based error correction.
+# The k-mer cutoff parameter is learned automatically.
+$SGA_BIN correct -k $CORRECTION_K --learn -t $CPU -o reads.ec.fastq reads.pp.fastq
+
+#
+# Primary (contig) assembly
+#
+
+# Index the corrected data.
+$SGA_BIN index -a ropebwt -t $CPU reads.ec.fastq
+
+# Remove exact-match duplicates and reads with low-frequency k-mers
+$SGA_BIN filter -x 2 -t $CPU reads.ec.fastq
+
+# Compute the structure of the string graph
+$SGA_BIN overlap -m $MIN_OVERLAP -t $CPU reads.ec.filter.pass.fa
+
+# Perform the contig assembly
+$SGA_BIN assemble -m $ASSEMBLE_OVERLAP --min-branch-length $TRIM_LENGTH -o primary reads.ec.filter.pass.asqg.gz
+
+#
+# Scaffolding
+#
+
+PRIMARY_CONTIGS=primary-contigs.fa
+PRIMARY_GRAPH=primary-graph.asqg.gz
+
+# Align the reads to the contigs
+$BWA_BIN index $PRIMARY_CONTIGS
+$BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN1 > $IN1.sai
+$BWA_BIN aln -t $CPU $PRIMARY_CONTIGS $IN2 > $IN2.sai
+$BWA_BIN sampe $PRIMARY_CONTIGS $IN1.sai $IN2.sai $IN1 $IN2 | $SAMTOOLS_BIN view -Sb - > libPE.bam
+
+# Convert the BAM file into a set of contig-contig distance estimates
+$BAM2DE_BIN -n $MIN_PAIRS -m $MIN_CONTIG_LENGTH --prefix libPE libPE.bam
+
+# Compute copy number estimates of the contigs
+$ASTAT_BIN -m $MIN_CONTIG_LENGTH libPE.bam > libPE.astat
+
+# Build the scaffolds
+$SGA_BIN scaffold -m $MIN_CONTIG_LENGTH -a libPE.astat -o scaffolds.scaf --pe libPE.de $PRIMARY_CONTIGS
+
+# Convert the scaffolds to FASTA format
+$SGA_BIN scaffold2fasta --use-overlap --write-unplaced -m $MIN_CONTIG_LENGTH -a $PRIMARY_GRAPH -o sga-scaffolds.fa scaffolds.scaf

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/sga.git



More information about the debian-med-commit mailing list