[med-svn] [Git][med-team/kma][upstream] New upstream version 1.4.14
Lance Lin (@linqigang)
gitlab at salsa.debian.org
Wed Aug 23 14:29:04 BST 2023
Lance Lin pushed to branch upstream at Debian Med / kma
Commits:
149fa09e by Lance Lin at 2023-08-22T22:12:39+07:00
New upstream version 1.4.14
- - - - -
29 changed files:
- + .github/workflows/conda-linux-release.yml
- + .github/workflows/conda-osx-release.yml
- + .github/workflows/release.yml
- + .idea/.gitignore
- + .idea/kma.iml
- + .idea/misc.xml
- + .idea/modules.xml
- + .idea/vcs.xml
- README.md
- + bitbucket-pipelines.yml
- + conda/build.sh
- + conda/meta.yaml
- dist.c
- + envs/osx-build.yml
- kma.c
- kmeranker.c
- kmeranker.h
- kmers.c
- runinput.c
- runinput.h
- runkma.c
- savekmers.c
- seqparse.c
- sparse.c
- sparse.h
- spltdb.c
- stdstat.h
- trim.c
- version.h
Changes:
=====================================
.github/workflows/conda-linux-release.yml
=====================================
@@ -0,0 +1,19 @@
+name: conda-linux-release
+on:
+ push:
+ # Sequence of patterns matched against refs/tags
+ tags:
+ - '*' # Push events to matching *, i.e. 1.0, 20.15.10
+jobs:
+ conda-linux:
+ name: Linux Upload
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout at v1
+ - name: publish-to-conda-linux
+ uses: BEFH/anaconda-publish at v1.5.1
+ with:
+ subDir: 'conda'
+ channels: 'conda-forge -c bioconda -c defaults'
+ AnacondaToken: ${{ secrets.GENOMICEPIDEMIOLOGY_CONDA_AUTH_TOKEN }}
+ publish: true
=====================================
.github/workflows/conda-osx-release.yml
=====================================
@@ -0,0 +1,43 @@
+name: conda-osx-release
+on:
+ push:
+ # Sequence of patterns matched against refs/tags
+ tags:
+ - '*' # Push events to matching *, i.e. 1.0, 20.15.10
+jobs:
+ conda-osx:
+ name: ${{ matrix.os }}
+ runs-on: ${{ matrix.os }}
+ defaults:
+ run:
+ shell: bash -l {0}
+
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [macos-latest]
+
+ steps:
+ - name: checkout repository
+ uses: actions/checkout at v2
+
+ - name: create environment with mamba
+ uses: conda-incubator/setup-miniconda at v2
+ with:
+ mamba-version: "*"
+ channels: conda-forge,bioconda,defaults
+ auto-activate-base: true
+ activate-environment: osx-build
+ environment-file: envs/osx-build.yml
+
+ - name: check solution
+ run: |
+ mamba env export
+
+ - name: build conda package
+ run: |
+ conda mambabuild conda -c conda-forge --output-folder .
+
+ - name: upload conda package
+ run: |
+ anaconda -t ${{ secrets.GENOMICEPIDEMIOLOGY_CONDA_AUTH_TOKEN }} upload --label main osx-64/*.tar.bz2
=====================================
.github/workflows/release.yml
=====================================
@@ -0,0 +1,29 @@
+on:
+ push:
+ # Sequence of patterns matched against refs/tags
+ tags:
+ - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10
+
+name: Create Release
+
+jobs:
+ auto-tag-release:
+ name: Create Release
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout at v2
+ - name: Create Release
+ id: create_release
+ uses: actions/create-release at v1
+ env:
+ GITHUB_TOKEN: ${{ secrets.PERSONAL_TOKEN }} # This token is provided by Actions, you do not need to create your own token
+ with:
+ tag_name: ${{ github.ref }}
+ release_name: Release ${{ github.ref }}
+ body: |
+ Changes in this Release
+ - First Change
+ - Second Change
+ draft: false
+ prerelease: false
\ No newline at end of file
=====================================
.idea/.gitignore
=====================================
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
=====================================
.idea/kma.iml
=====================================
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+ <component name="NewModuleRootManager" inherit-compiler-output="true">
+ <exclude-output />
+ <content url="file://$MODULE_DIR$" />
+ <orderEntry type="inheritedJdk" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ </component>
+</module>
\ No newline at end of file
=====================================
.idea/misc.xml
=====================================
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="ProjectRootManager">
+ <output url="file://$PROJECT_DIR$/out" />
+ </component>
+</project>
\ No newline at end of file
=====================================
.idea/modules.xml
=====================================
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="ProjectModuleManager">
+ <modules>
+ <module fileurl="file://$PROJECT_DIR$/.idea/kma.iml" filepath="$PROJECT_DIR$/.idea/kma.iml" />
+ </modules>
+ </component>
+</project>
\ No newline at end of file
=====================================
.idea/vcs.xml
=====================================
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="VcsDirectoryMappings">
+ <mapping directory="" vcs="Git" />
+ </component>
+</project>
\ No newline at end of file
=====================================
README.md
=====================================
@@ -1,3 +1,4 @@
+[![install with bioconda](https://img.shields.io/badge/install%20with-genomicepidemiology-brightgreen.svg?style=flat)](https://anaconda.org/genomicepidemiology/kma)
[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/kma/README.html)
# Getting Started #
=====================================
bitbucket-pipelines.yml
=====================================
@@ -0,0 +1,8 @@
+clone:
+ depth: full
+pipelines:
+ custom:
+ mirror-to-github:
+ - step:
+ script:
+ - git push --mirror git at github.com:genomicepidemiology/kma
\ No newline at end of file
=====================================
conda/build.sh
=====================================
@@ -0,0 +1,7 @@
+BINARIES="kma kma_index kma_shm kma_update"
+make CFLAGS="-w -O3 -I$PREFIX/include -L$PREFIX/lib"
+
+mkdir -p ${PREFIX}/bin
+cp $BINARIES $PREFIX/bin
+mkdir -p $PREFIX/doc/kma
+cp README.md $PREFIX/doc/kma/
\ No newline at end of file
=====================================
conda/meta.yaml
=====================================
@@ -0,0 +1,30 @@
+package:
+ name: kma
+ version: 1.4.9
+
+source:
+ url: https://bitbucket.org/genomicepidemiology/kma/get/1.4.9.tar.gz
+
+build:
+ number: 0
+
+requirements:
+ build:
+ - make
+ - {{ compiler('c') }}
+ host:
+ - zlib
+ run:
+ - zlib
+
+test:
+ commands:
+ - kma -h | grep KMA >/dev/null
+
+about:
+ home: https://bitbucket.org/genomicepidemiology/kma
+ summary: 'KMA is mapping a method designed to map raw reads directly against redundant databases, in an ultra-fast manner using seed and extend.'
+ license: Apache-2.0
+extra:
+ identifiers:
+ - doi:10.1186/s12859-018-2336-6
\ No newline at end of file
=====================================
dist.c
=====================================
@@ -80,8 +80,8 @@ HashMapKMA * loadValues(const char *filename) {
dest->exist = smalloc(size);
check = fread(dest->exist, 1, size, file);
if(check != size) {
- free(dest);
free(dest->exist);
+ free(dest);
fclose(file);
return 0;
}
@@ -110,9 +110,9 @@ HashMapKMA * loadValues(const char *filename) {
dest->values = smalloc(size);
check = fread(dest->values, 1, size, file);
if(check != size) {
- free(dest);
free(dest->exist);
free(dest->values);
+ free(dest);
fclose(file);
return 0;
}
@@ -141,9 +141,9 @@ HashMapKMA * loadValues(const char *filename) {
dest->exist = smalloc(size);
check = fread(dest->exist, 1, size, file);
if(check != size) {
- free(dest);
free(dest->exist);
free(dest->values);
+ free(dest);
fclose(file);
return 0;
}
=====================================
envs/osx-build.yml
=====================================
@@ -0,0 +1,9 @@
+name: osx-build
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - mamba
+ - python
+ - boa
+ - anaconda-client
\ No newline at end of file
=====================================
kma.c
=====================================
@@ -192,6 +192,7 @@ static void helpMessage(int exitStatus) {
fprintf(out, "#\n# Trimming:\n");
fprintf(out, "# %16s\t%-32s\t%s\n", "-mp", "Minimum phred score", "20");
+ fprintf(out, "# %16s\t%-32s\t%s\n", "-mi", "Minimum internal phred score", "0");
fprintf(out, "# %16s\t%-32s\t%s\n", "-eq", "Minimum avg. quality score", "0");
fprintf(out, "# %16s\t%-32s\t%s\n", "-5p", "Trim 5 prime", "0");
fprintf(out, "# %16s\t%-32s\t%s\n", "-3p", "Trim 3 prime", "0");
@@ -230,11 +231,11 @@ int kma_main(int argc, char *argv[]) {
0.0000000000398107, 0.0000000000316228, 0.0000000000251189, 0.0000000000199526, 0.0000000000158489, 0.0000000000125893, 0.0000000000100000, 0.0000000000079433,
0.0000000000063096, 0.0000000000050119, 0.0000000000039811, 0.0000000000031623, 0.0000000000025119, 0.0000000000019953, 0.0000000000015849, 0.0000000000012589,
0.0000000000010000, 0.0000000000007943, 0.0000000000006310, 0.0000000000005012, 0.0000000000003981, 0.0000000000003162, 0.0000000000002512, 0.0000000000001995};
- static int minPhred, minQ, fiveClip, threeClip, ConClave, mem_mode;
- static int fileCounter, fileCounter_PE, fileCounter_INT, Ts, Tv, minlen;
+ static int minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, maxlen;
+ static int fileCounter, fileCounter_PE, fileCounter_INT, Ts, Tv, mem_mode;
static int extendedFeatures, spltDB, thread_num, kmersize, targetNum, mq;
static int ref_fsa, print_matrix, print_all, sam, vcf, Mt1, bcd, one2one;
- static int sparse_run, ts, maxlen, **d, status = 0;
+ static int ConClave, sparse_run, ts, **d, status = 0;
static unsigned xml, nc, nf, shm, exhaustive, verbose;
static long unsigned tsv;
static char *outputfilename, *templatefilename, **templatefilenames;
@@ -271,6 +272,7 @@ int kma_main(int argc, char *argv[]) {
spltDB = 0;
extendedFeatures = 0;
minPhred = 20;
+ minmaskQ = 0;
minQ = 0;
fiveClip = 0;
threeClip = 0;
@@ -580,6 +582,15 @@ int kma_main(int argc, char *argv[]) {
exit(1);
}
}
+ } else if(strcmp(argv[args], "-mi") == 0) {
+ ++args;
+ if(args < argc) {
+ minmaskQ = strtoul(argv[args], &exeBasic, 10);
+ if(*exeBasic != 0) {
+ fprintf(stderr, "# Invalid internal minimum phred score parsed\n");
+ exit(1);
+ }
+ }
} else if(strcmp(argv[args], "-eq") == 0) {
++args;
if(args < argc) {
@@ -1430,8 +1441,11 @@ int kma_main(int argc, char *argv[]) {
free(inputfiles_INT);
fprintf(stderr, "Interleaved information is not considered in Sparse mode.\n");
}
+ if(minPhred < minmaskQ) {
+ minPhred = minmaskQ;
+ }
- run_input_sparse(templates, inputfiles, fileCounter, minPhred, minQ, fiveClip, threeClip, kmersize, to2Bit, prob, ioStream);
+ run_input_sparse(templates, inputfiles, fileCounter, minPhred, minmaskQ, minQ, fiveClip, threeClip, kmersize, to2Bit, prob, ioStream);
hashMapKMA_destroy(templates);
free(myTemplatefilename);
} else {
@@ -1449,20 +1463,23 @@ int kma_main(int argc, char *argv[]) {
myTemplatefilename = 0;
}
totFrags = 0;
+ if(minPhred < minmaskQ) {
+ minPhred = minmaskQ;
+ }
/* SE */
if(fileCounter > 0) {
- totFrags += run_input(inputfiles, fileCounter, minPhred, minQ, fiveClip, threeClip, minlen, maxlen, to2Bit, prob, ioStream);
+ totFrags += run_input(inputfiles, fileCounter, minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, maxlen, to2Bit, prob, ioStream);
}
/* PE */
if(fileCounter_PE > 0) {
- totFrags += run_input_PE(inputfiles_PE, fileCounter_PE, minPhred, minQ, fiveClip, threeClip, minlen, to2Bit, prob, ioStream);
+ totFrags += run_input_PE(inputfiles_PE, fileCounter_PE, minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, to2Bit, prob, ioStream);
}
/* INT */
if(fileCounter_INT > 0) {
- totFrags += run_input_INT(inputfiles_INT, fileCounter_INT, minPhred, minQ, fiveClip, threeClip, minlen, to2Bit, prob, ioStream);
+ totFrags += run_input_INT(inputfiles_INT, fileCounter_INT, minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, to2Bit, prob, ioStream);
}
fprintf(stderr, "#\n# Total number of query fragment after trimming:\t%lu\n", totFrags);
=====================================
kmeranker.c
=====================================
@@ -22,7 +22,7 @@
#include "kmeranker.h"
#include "penalties.h"
-KmerAnker * (*getChainTemplates)(KmerAnker*, const Penalties*, const int*, const int, const int, int*, int*, int*, char*) = &getBestChainTemplates;
+KmerAnker * (*getChainTemplates)(KmerAnker*, const Penalties*, const int*, const int, const int, const int, int*, int*, int*, char*) = &getBestChainTemplates;
int (*kmerAnkerScore)(KmerAnker*) = &ankerScore;
const int (*testExtension)(const int, const int, const int) = &testExtensionScore;
const int (*proxiTestBest)(const double, const int, const int, const int, const int) = &proxiTestBestScore;
@@ -80,7 +80,7 @@ const int mrchain(int *bestTemaples, const int *template_lengths, const int q_le
return 1;
}
-KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, int *bests, int *Score, int *extendScore, char *include) {
+KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, const int mlen, int *bests, int *Score, int *extendScore, char *include) {
/* get set of best templates and silences the chain, except for the initial anker */
int i, j, template, score, tmpScore, bestScore, Wl, W1, U, M, MM, Ms, MMs;
@@ -160,6 +160,8 @@ KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, cons
if(gaps == -kmersize) {
/* Perfect match */
score += node->weight - (kmersize - 1) * M;
+ } else if(gaps == 0) {
+ score += node->weight + MM;
} else if(0 < gaps) {
/* mismatch or insersion */
if(gaps <= 2) {
@@ -178,9 +180,12 @@ KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, cons
} else {
score += node->weight + (W1 + (gaps - 1) * U);
}
- } else {
+ } else if(mlen != kmersize) {
/* snp */
- score += node->weight + (gaps + 1) * M + MM;
+ score += node->weight + gaps * M + MM;
+ } else {
+ /* indel */
+ score += node->weight + gaps * M - (gaps + 1) * U + W1;
}
/* mark as used */
@@ -227,7 +232,7 @@ KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, cons
return j ? prev : 0;
}
-KmerAnker * getProxiChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, int *bests, int *Score, int *extendScore, char *include) {
+KmerAnker * getProxiChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, const int mlen, int *bests, int *Score, int *extendScore, char *include) {
/* get set of best templates and silences the chain, except for the initial anker */
static long unsigned *softProxi = 0;
@@ -290,6 +295,8 @@ KmerAnker * getProxiChainTemplates(KmerAnker *src, const Penalties *rewards, con
if(gaps == -kmersize) {
/* Perfect match */
score += node->weight - (kmersize - 1) * M;
+ } else if(gaps == 0) {
+ score += node->weight + MM;
} else if(0 < gaps) {
/* mismatch or insersion */
if(gaps <= 2) {
@@ -308,9 +315,12 @@ KmerAnker * getProxiChainTemplates(KmerAnker *src, const Penalties *rewards, con
} else {
score += node->weight + (W1 + (gaps - 1) * U);
}
- } else {
+ } else if(mlen != kmersize) {
/* snp */
- score += node->weight + (gaps + 1) * M + MM;
+ score += node->weight + gaps * M + MM;
+ } else {
+ /* indel */
+ score += node->weight + gaps * M - (gaps + 1) * U + W1;
}
/* mark as used */
=====================================
kmeranker.h
=====================================
@@ -34,7 +34,7 @@ struct kmerAnker {
};
#endif
-extern KmerAnker * (*getChainTemplates)(KmerAnker*, const Penalties*, const int*, const int, const int, int*, int*, int*, char*);
+extern KmerAnker * (*getChainTemplates)(KmerAnker*, const Penalties*, const int*, const int, const int, const int, int*, int*, int*, char*);
extern int (*kmerAnkerScore)(KmerAnker*);
extern const int (*testExtension)(const int, const int, const int);
extern const int (*proxiTestBest)(const double, const int, const int, const int, const int);
@@ -47,8 +47,8 @@ const int testExtensionScoreLen(const int q_len, const int t_len, const int best
const int proxiTestBestScore(const double proxiScore, const int score, const int q_len, const int t_len, const int best_len);
const int proxiTestBestScoreLen(const double proxiScore, const int score, const int q_len, const int t_len, const int best_len);
const int mrchain(int *bestTemaples, const int *template_lengths, const int q_len, const int maplen);
-KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, int *bests, int *Score, int *extendScore, char *include);
-KmerAnker * getProxiChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, int *bests, int *Score, int *extendScore, char *include);
+KmerAnker * getBestChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, const int mlen, int *bests, int *Score, int *extendScore, char *include);
+KmerAnker * getProxiChainTemplates(KmerAnker *src, const Penalties *rewards, const int *template_lengths, const int q_len, const int kmersize, const int mlen, int *bests, int *Score, int *extendScore, char *include);
KmerAnker * pruneAnkers(KmerAnker *V_score, int kmersize);
KmerAnker * getBestAnkerScore(KmerAnker **src, unsigned *ties, const int *template_lengths);
KmerAnker * getBestAnkerScoreLen(KmerAnker **src, unsigned *ties, const int *template_lengths);
=====================================
kmers.c
=====================================
@@ -147,7 +147,7 @@ int save_kmers_batch(char *templatefilename, char *exePrev, unsigned shm, int th
getF((int *)(&minFrac), (int *)(softProxi), 0, 0, 0);
ankerAndClean((int *)(&minFrac), (int *)(softProxi), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
ankerAndClean_MEM((int *)(&minFrac), (int *)(softProxi), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
- getProxiChainTemplates(0, (const Penalties *)(&minFrac), (int *)(softProxi), 0, 0, 0, 0, 0, 0);
+ getProxiChainTemplates(0, (const Penalties *)(&minFrac), (int *)(softProxi), 0, 0, 0, 0, 0, 0, 0);
}
template_lengths = 0;
if(kmerScan == &save_kmers_HMM || kmerScan == &save_kmers_chain || kmerScan == &save_kmers_sparse_chain) {
=====================================
runinput.c
=====================================
@@ -29,7 +29,29 @@
void (*printFsa_ptr)(Qseqs*, Qseqs*, Qseqs*, CompDNA*, FILE*) = &printFsa;
void (*printFsa_pair_ptr)(Qseqs*, Qseqs*, Qseqs*, Qseqs*, Qseqs*, Qseqs*, CompDNA*, FILE*) = &printFsa_pair;
-long unsigned run_input(char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int minlen, int maxlen, char *trans, const double *prob, FILE *out) {
+unsigned hardmaskQ(unsigned char *seq, unsigned char *qual, int len, const int phredScale, int minQ) {
+
+ unsigned n;
+
+ if(!len || !minQ) {
+ return 0;
+ }
+
+ minQ += phredScale;
+ n = 0;
+ --qual;
+ do {
+ if(*++qual < minQ) {
+ *seq = 4;
+ ++n;
+ }
+ ++seq;
+ } while(--len);
+
+ return n;
+}
+
+long unsigned run_input(char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int minlen, int maxlen, char *trans, const double *prob, FILE *out) {
int fileCounter, phredScale, phredCut, start, end;
unsigned FASTQ;
@@ -89,7 +111,7 @@ long unsigned run_input(char **inputfiles, int fileCount, int minPhred, int minQ
qual->len = end - start;
/* print */
- if(minlen <= qseq->len && minQ <= eQual(seq + start, qseq->len, minQ, prob - phredScale)) {
+ if(minlen <= qseq->len && qseq->len != hardmaskQ(qseq->seq + start, seq + start, qseq->len, phredScale, minmaskQ) && minQ <= eQual(seq + start, qseq->len, minQ, prob - phredScale)) {
/* dump seq */
qseq->seq += start;
qual->seq += start;
@@ -144,7 +166,7 @@ long unsigned run_input(char **inputfiles, int fileCount, int minPhred, int minQ
return count;
}
-long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out) {
+long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out) {
int fileCounter, phredScale, phredCut, start, start2, end;
unsigned FASTQ, FASTQ2;
@@ -185,6 +207,7 @@ long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int m
} else {
fprintf(stderr, "Inputfiles:\t%s %s\nAre in different format.\n", inputfiles[fileCounter-1], filename);
FASTQ = 0;
+ errno = 1;
}
/* parse the file */
@@ -222,6 +245,10 @@ long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int m
*/
qseq->len = end - start;
qual->len = end - start;
+ if(qseq->len == hardmaskQ(qseq->seq + start, seq + start, qseq->len, phredScale, minmaskQ)) {
+ qseq->len = 0;
+ qual->len = 0;
+ }
eq1 = eQual(seq + start, qseq->len, minQ, prob - phredScale);
/* trim reverse */
@@ -245,6 +272,10 @@ long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int m
*/
qseq2->len = end - start2;
qual2->len = end - start2;
+ if(qseq2->len == hardmaskQ(qseq2->seq + start, seq + start, qseq2->len, phredScale, minmaskQ)) {
+ qseq2->len = 0;
+ qual2->len = 0;
+ }
eq2 = eQual(seq + start2, qseq2->len, minQ, prob - phredScale);
/* print */
@@ -344,7 +375,7 @@ long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int m
return count;
}
-long unsigned run_input_INT(char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out) {
+long unsigned run_input_INT(char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out) {
int fileCounter, phredScale, phredCut, start, start2, end;
unsigned FASTQ;
@@ -409,6 +440,10 @@ long unsigned run_input_INT(char **inputfiles, int fileCount, int minPhred, int
*/
qseq->len = end - start;
qual->len = end - start;
+ if(qseq->len == hardmaskQ(qseq->seq + start, seq + start, qseq->len, phredScale, minmaskQ)) {
+ qseq->len = 0;
+ qual->len = 0;
+ }
eq1 = eQual(seq + start, qseq->len, minQ, prob - phredScale);
/* trim reverse */
@@ -431,6 +466,10 @@ long unsigned run_input_INT(char **inputfiles, int fileCount, int minPhred, int
*/
qseq2->len = end - start2;
qual2->len = end - start2;
+ if(qseq2->len == hardmaskQ(qseq2->seq + start, seq + start, qseq2->len, phredScale, minmaskQ)) {
+ qseq2->len = 0;
+ qual2->len = 0;
+ }
eq2 = eQual(seq + start2, qseq2->len, minQ, prob - phredScale);
/* print */
=====================================
runinput.h
=====================================
@@ -23,9 +23,10 @@
/* pointers determining how to deliver the input */
extern void (*printFsa_ptr)(Qseqs*, Qseqs*, Qseqs*, CompDNA*, FILE*);
extern void (*printFsa_pair_ptr)(Qseqs*, Qseqs*, Qseqs*, Qseqs*, Qseqs*, Qseqs*, CompDNA*, FILE*);
-long unsigned run_input(char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int minlen, int maxlen, char *trans, const double *prob, FILE *out);
-long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out);
-long unsigned run_input_INT(char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out);
+unsigned hardmaskQ(unsigned char *seq, unsigned char *qual, int len, const int phredScale, int minQ);
+long unsigned run_input(char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int minlen, int maxlen, char *trans, const double *prob, FILE *out);
+long unsigned run_input_PE(char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out);
+long unsigned run_input_INT(char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int minlen, char *trans, const double *prob, FILE *out);
void bootFsa(Qseqs *header, Qseqs *qseq, Qseqs *qual, CompDNA *compressor, FILE *out);
void printFsa(Qseqs *header, Qseqs *qseq, Qseqs *qual, CompDNA *compressor, FILE *out);
void printFsa_pair(Qseqs *header, Qseqs *qseq, Qseqs *qual, Qseqs *header_r, Qseqs *qseq_r, Qseqs *qual_r, CompDNA *compressor, FILE *out);
=====================================
runkma.c
=====================================
@@ -152,7 +152,7 @@ int runKMA(char *templatefilename, char *outputfilename, char *exePrev, int ConC
AssemInfo *matrix;
AlnPoints *points;
NWmat *NWmatrices;
- Assemble_thread *threads, *thread;
+ Assemble_thread *threads, *thread, *next;
Aln_thread *alnThreads, *alnThread;
HashMapCCI **templates_index;
@@ -650,7 +650,12 @@ int runKMA(char *templatefilename, char *outputfilename, char *exePrev, int ConC
aligned_assem->q = smalloc(aligned_assem->size);
/* allocate matrcies for NW */
- for(thread = threads; thread != 0; thread = thread->next) {
+ i = 1;
+ thread = threads;
+ threads = 0;
+ while(thread) {
+ next = thread->next;
+
/* get remaining thread info */
aligned = smalloc(sizeof(Aln));
gap_align = smalloc(sizeof(Aln));
@@ -672,9 +677,36 @@ int runKMA(char *templatefilename, char *outputfilename, char *exePrev, int ConC
if((errno = pthread_create(&thread->id, NULL, assembly_KMA_Ptr, thread))) {
fprintf(stderr, "Error: %d (%s)\n", errno, strerror(errno));
fprintf(stderr, "Will continue with %d threads.\n", i);
- threads = thread->next;
- free(thread);
+
+ /* free tracebacks */
+ free(aligned->t);
+ free(aligned->s);
+ free(aligned->q);
+ free(aligned);
+ free(gap_align->t);
+ free(gap_align->s);
+ free(gap_align->q);
+ free(gap_align);
+
+ /* free remaining nodes */
+ while(thread) {
+ next = thread->next;
+ NWmatrices = thread->NWmatrices;
+ free(NWmatrices->E);
+ free(NWmatrices->D[0]);
+ free(NWmatrices->P[0]);
+ free(NWmatrices);
+ destroyQseqs(thread->qseq);
+ destroyQseqs(thread->header);
+ seedPoint_free(thread->points);
+ thread = next;
+ }
+ } else {
+ thread->next = threads;
+ threads = thread;
+ ++i;
}
+ thread = next;
}
/* start main thread */
@@ -835,7 +867,7 @@ int runKMA(char *templatefilename, char *outputfilename, char *exePrev, int ConC
assembly_KMA_Ptr(thread);
for(thread = threads; thread != 0; thread = thread->next) {
/* join thread */
- if((errno = pthread_join(thread->id, NULL))) {
+ if(thread->id && (errno = pthread_join(thread->id, NULL))) {
ERROR();
}
}
=====================================
savekmers.c
=====================================
@@ -546,12 +546,17 @@ int get_kmers_for_pair(const HashMapKMA *templates, const Penalties *rewards, in
++W1s;
Us += (gaps -1);
}
- } else {
- Ms += (gaps - 1);
+ } else if(mlen != kmersize) {
+ Ms += gaps;
++MMs;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ Ms += gaps;
+ ++W1s;
+ Us += (kmersize - gaps);
}
HIT = j;
@@ -568,20 +573,19 @@ int get_kmers_for_pair(const HashMapKMA *templates, const Penalties *rewards, in
}
HIT = j - 1;
last = values;
- score = kmersize * M;
values_s = (short unsigned *) values;
n = SU ? *values_s : *values;
l = n + 1;
while(--l) {
template = SU ? *++values_s : *++values;
- if(Scores[template] != 0) {
+ if(include[template]) {
gaps = HIT - extendScore[template];
if(gaps == 0) {
/* match */
- Scores[template] += M;
+ score = M;
} else if(gaps == kmersize) {
/* snp */
- Scores[template] += score + MM;
+ score = kmersize * M + MM;
} else if(kmersize < gaps) {
/* mismatch or insersion */
gaps -= (kmersize - 1); /* adjust for consecutive k-mer mismatches */
@@ -597,18 +601,22 @@ int get_kmers_for_pair(const HashMapKMA *templates, const Penalties *rewards, in
/* evaluate best option */
if((W1 + (gaps - 1) * U) <= (mm * MM + m * M)) {
- Scores[template] += score + (mm * MM + m * M);
+ score = kmersize * M + (mm * MM + m * M);
} else {
- Scores[template] += score + (W1 + (gaps - 1) * U);
+ score = kmersize * M + (W1 + (gaps - 1) * U);
}
- } else {
- Scores[template] += (gaps + 1) * M + MM;
+ } else if(mlen != kmersize) {
+ score = gaps * M + MM;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ score = gaps * M + (kmersize - gaps) * U + W1;
}
+ Score[template] += score;
} else {
- Scores[template] = score;
+ Scores[template] = kmersize * M;
if(include[template] == 0) {
include[template] = 1;
bests[++*bests] = template;
@@ -1107,12 +1115,17 @@ int get_kmers_for_pair_pseoudoSparse(const HashMapKMA *templates, const Penaltie
++W1s;
Us += (gaps -1);
}
- } else {
- Ms += (gaps - 1);
+ } else if(mlen != kmersize) {
+ Ms += gaps;
++MMs;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ Ms += gaps;
+ ++W1s;
+ Us += (kmersize - gaps);
}
HIT = j;
@@ -1129,20 +1142,19 @@ int get_kmers_for_pair_pseoudoSparse(const HashMapKMA *templates, const Penaltie
}
HIT = j - 1;
last = values;
- score = kmersize * M;
values_s = (short unsigned *) values;
n = SU ? *values_s : *values;
l = n + 1;
while(--l) {
template = SU ? *++values_s : *++values;
- if(Scores[template] != 0) {
+ if(include[template]) {
gaps = HIT - extendScore[template];
if(gaps == 0) {
/* match */
- Scores[template] += M;
+ score = M;
} else if(gaps == kmersize) {
/* snp */
- Scores[template] += score + MM;
+ score = kmersize * M + MM;
} else if(kmersize < gaps) {
/* mismatch or insersion */
gaps -= (kmersize - 1); /* adjust for consecutive k-mer mismatches */
@@ -1158,18 +1170,22 @@ int get_kmers_for_pair_pseoudoSparse(const HashMapKMA *templates, const Penaltie
/* evaluate best option */
if((W1 + (gaps - 1) * U) <= (mm * MM + m * M)) {
- Scores[template] += score + (mm * MM + m * M);
+ score = kmersize * M + (mm * MM + m * M);
} else {
- Scores[template] += score + (W1 + (gaps - 1) * U);
+ score = kmersize * M + (W1 + (gaps - 1) * U);
}
- } else {
- Scores[template] += (gaps + 1) * M + MM;
+ } else if(mlen != kmersize) {
+ score = gaps * M + MM;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ score = gaps * M + (kmersize - gaps) * U + W1;
}
+ Scores[template] += score;
} else {
- Scores[template] = score;
+ Scores[template] = kmersize * M;
if(include[template] == 0) {
include[template] = 1;
bests[++*bests] = template;
@@ -2248,12 +2264,17 @@ int save_kmers_pseuodeSparse(const HashMapKMA *templates, const Penalties *rewar
++W1s;
Us += (gaps -1);
}
- } else {
- Ms += (gaps - 1);
+ } else if(mlen != kmersize) {
+ Ms += gaps;
++MMs;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ Ms += gaps;
+ ++W1s;
+ Us += (kmersize - gaps);
}
HIT = j;
@@ -2270,20 +2291,19 @@ int save_kmers_pseuodeSparse(const HashMapKMA *templates, const Penalties *rewar
}
HIT = j - 1;
last = values;
- score = kmersize * M;
values_s = (short unsigned *) values;
n = SU ? *values_s : *values;
l = n + 1;
while(--l) {
template = SU ? *++values_s : *++values;
- if(Score[template] != 0) {
+ if(include[template]) {
gaps = HIT - extendScore[template];
if(gaps == 0) {
/* match */
- Score[template] += M;
+ score = M;
} else if(gaps == kmersize) {
/* snp */
- Score[template] += score + MM;
+ score = kmersize * M + MM;
} else if(kmersize < gaps) {
/* mismatch or insersion */
gaps -= (kmersize - 1); /* adjust for consecutive k-mer mismatches */
@@ -2299,18 +2319,22 @@ int save_kmers_pseuodeSparse(const HashMapKMA *templates, const Penalties *rewar
/* evaluate best option */
if((W1 + (gaps - 1) * U) <= (mm * MM + m * M)) {
- Score[template] += score + (mm * MM + m * M);
+ score = kmersize * M + (mm * MM + m * M);
} else {
- Score[template] += score + (W1 + (gaps - 1) * U);
+ score = kmersize * M + (W1 + (gaps - 1) * U);
}
- } else {
- Score[template] += (gaps + 1) * M + MM;
+ } else if(mlen != kmersize) {
+ score = gaps * M + MM;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ score = gaps * M + (kmersize - gaps) * U + W1;
}
+ Score[template] += score;
} else {
- Score[template] = score;
+ Score[template] = kmersize * M;
if(include[template] == 0) {
include[template] = 1;
bestTemplates[++*bestTemplates] = template;
@@ -2419,8 +2443,8 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
int i, j, j_u, l, end, HIT, gaps, score, Ms, MMs, Us, W1s, W1, U, M, MM;
int template, hitCounter, bestScore, bestScore_r, kmersize, mPos, hLen;
- int seqend, m, mm;
- unsigned *values, *last, n, SU, shifter, cPos, iPos, mlen, flag;
+ int seqend, m, mm, mlen;
+ unsigned *values, *last, n, SU, shifter, cPos, iPos, flag;
short unsigned *values_s;
long unsigned mask, mmask, kmer, cmer, hmer, *seq;
char *include;
@@ -2529,14 +2553,19 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
Ms += m;
} else {
++W1s;
- Us += (gaps -1);
+ Us += (gaps - 1);
}
- } else {
- Ms += (gaps - 1);
+ } else if(mlen != kmersize) {
+ Ms += gaps;
++MMs;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ Ms += gaps;
+ ++W1s;
+ Us += (kmersize - gaps);
}
HIT = j;
@@ -2553,20 +2582,19 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
}
HIT = j - 1;
last = values;
- score = kmersize * M;
values_s = (short unsigned *) values;
n = SU ? *values_s : *values;
l = n + 1;
while(--l) {
template = SU ? *++values_s : *++values;
- if(Score[template] != 0) {
+ if(include[template]) {
gaps = HIT - extendScore[template];
if(gaps == 0) {
/* match */
- Score[template] += M;
- } else if(gaps == kmersize) {
+ score = M;
+ } else if(mlen <= gaps && gaps <= kmersize) {
/* snp */
- Score[template] += score + MM;
+ score = gaps * M + MM;
} else if(kmersize < gaps) {
/* mismatch or insersion */
gaps -= (kmersize - 1); /* adjust for consecutive k-mer mismatches */
@@ -2582,18 +2610,43 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
/* evaluate best option */
if((W1 + (gaps - 1) * U) <= (mm * MM + m * M)) {
- Score[template] += score + (mm * MM + m * M);
+ score = kmersize * M + (mm * MM + m * M);
} else {
- Score[template] += score + (W1 + (gaps - 1) * U);
+ score = kmersize * M + (W1 + (gaps - 1) * U);
}
- } else {
- Score[template] += (gaps + 1) * M + MM;
+ } else if(mlen != kmersize) {
+ score = gaps * M + MM;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ score = gaps * M + (kmersize - gaps) * U + W1;
}
+ Score[template] += score;
} else {
- Score[template] = score;
+ /* consider opening penalty */
+ /*
+ gaps = j;
+ if(gaps == 0) {
+ Score[template] = score;
+ } else {
+ if(gaps <= 2) {
+ mm = gaps;
+ m = 0;
+ } else {
+ mm = gaps / kmersize + (gaps % kmersize ? 1 : 0);
+ mm = MAX(2, mm);
+ m = MIN(gaps - mm, kmersize);
+ m = MIN(m, mm);
+ }
+ // evaluate best option
+ open = ((W1 + (gaps - 1) * U) < Wl) ? Wl : (W1 + (gaps - 1) * U);
+ open = open < (mm * MM + m * M) ? (mm * MM + m * M) : open;
+ Score[template] = score + open;
+ }
+ */
+ Score[template] = kmersize * M;
if(include[template] == 0) {
include[template] = 1;
bestTemplates[++*bestTemplates] = template;
@@ -2604,10 +2657,31 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
last = values;
values_s = (short unsigned *) values;
n = SU ? *values_s : *values;
- Ms = kmersize * M;
+ score = kmersize * M;
+
+ /* consider opening penalty */
+ /*
+ gaps = j;
+ if(gaps > 0) {
+ if(gaps <= 2) {
+ mm = gaps;
+ m = 0;
+ } else {
+ mm = gaps / kmersize + (gaps % kmersize ? 1 : 0);
+ mm = MAX(2, mm);
+ m = MIN(gaps - mm, kmersize);
+ m = MIN(m, mm);
+ }
+ // evaluate best option
+ open = ((W1 + (gaps - 1) * U) < Wl) ? Wl : (W1 + (gaps - 1) * U);
+ open = open < (mm * MM + m * M) ? (mm * MM + m * M) : open;
+ score += open;
+ }
+ */
+
for(l = 1; l <= n; ++l) {
template = SU ? *++values_s : *++values;
- Score[template] = Ms;
+ Score[template] = score;
include[template] = 1;
bestTemplates[l] = template;
}
@@ -2636,16 +2710,42 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
values_s = (short unsigned *) last;
l = (*values_s) + 1;
while(--l) {
- Score[values_s[l]] += score;
+ Score[(template = values_s[l])] += score;
+ extendScore[template] = HIT;
}
} else {
l = (*last) + 1;
while(--l) {
- Score[last[l]] += score;
+ Score[(template = last[l])] += score;
+ extendScore[template] = HIT;
}
}
+
for(l = *bestTemplates; l != 0; --l) {
- extendScore[(template = bestTemplates[l])] = 0;
+ template = bestTemplates[l];
+ /* consider closing penalties */
+ /*
+ HIT = qseq->seqlen - kmersize;
+ gaps = HIT - extendScore[template];
+ if(gaps > 0) {
+ if(gaps <= 2) {
+ mm = gaps;
+ m = 0;
+ } else {
+ mm = gaps / kmersize + (gaps % kmersize ? 1 : 0);
+ mm = MAX(2, mm);
+ m = MIN(gaps - mm, kmersize);
+ m = MIN(m, mm);
+ }
+ // evaluate best option
+ open = ((W1 + (gaps - 1) * U) < Wl) ? Wl : (W1 + (gaps - 1) * U);
+ open = open < (mm * MM + m * M) ? (mm * MM + m * M) : open;
+ Score[template] += open;
+ }
+ */
+
+ /* clear up */
+ extendScore[template] = 0;
include[template] = 0;
if(Score[template] < 0) {
Score[template] = 0;
@@ -2764,14 +2864,19 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
Ms += m;
} else {
++W1s;
- Us += (gaps -1);
+ Us += (gaps - 1);
}
- } else {
- Ms += (gaps - 1);
+ } else if(mlen != kmersize) {
+ Ms += gaps;
++MMs;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ Ms += gaps;
+ ++W1s;
+ Us += (kmersize - gaps);
}
HIT = j;
@@ -2788,20 +2893,19 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
}
HIT = j - 1;
last = values;
- score = kmersize * M;
values_s = (short unsigned *) values;
n = SU ? *values_s : *values;
l = n + 1;
while(--l) {
template = SU ? *++values_s : *++values;
- if(Score_r[template] != 0) {
+ if(include[template]) {
gaps = HIT - extendScore[template];
if(gaps == 0) {
/* match */
- Score_r[template] += M;
+ score = M;
} else if(gaps == kmersize) {
/* snp */
- Score_r[template] += score + MM;
+ score = kmersize * M + MM;
} else if(kmersize < gaps) {
/* mismatch or insersion */
gaps -= (kmersize - 1); /* adjust for consecutive k-mer mismatches */
@@ -2817,18 +2921,22 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
/* evaluate best option */
if((W1 + (gaps - 1) * U) <= (mm * MM + m * M)) {
- Score_r[template] += score + (mm * MM + m * M);
+ score = kmersize * M + (mm * MM + m * M);
} else {
- Score_r[template] += score + (W1 + (gaps - 1) * U);
+ score = kmersize * M + (W1 + (gaps - 1) * U);
}
- } else {
- Score_r[template] += (gaps + 1) * M + MM;
+ } else if(mlen != kmersize) {
+ score = gaps * M + MM;
/* unlikely deletion or random k-mer mismatch,
unless minimizers or homopolymer compression,
assume better and go random zero score else. */
+ } else {
+ /* indel */
+ score = gaps * M + (kmersize - gaps) * U + W1;
}
+ Score_r[template] += score;
} else {
- Score_r[template] = score;
+ Score_r[template] = kmersize * M;
if(include[template] == 0) {
include[template] = 1;
bestTemplates_r[++*bestTemplates_r] = template;
@@ -2839,10 +2947,10 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
last = values;
values_s = (short unsigned *) values;
n = SU ? *values_s : *values;
- Ms = kmersize * M;
+ score = kmersize * M;
for(l = 1; l <= n; ++l) {
template = SU ? *++values_s : *++values;
- Score_r[template] = Ms;
+ Score_r[template] = score;
include[template] = 1;
bestTemplates_r[l] = template;
}
@@ -2879,8 +2987,12 @@ int save_kmers(const HashMapKMA *templates, const Penalties *rewards, int *bestT
Score_r[last[l]] += score;
}
}
+
for(l = *bestTemplates_r; l != 0; --l) {
- extendScore[(template = bestTemplates_r[l])] = 0;
+ template = bestTemplates_r[l];
+
+ /* clear up */
+ extendScore[template] = 0;
include[template] = 0;
if(Score_r[template] < 0) {
Score_r[template] = 0;
@@ -5423,6 +5535,8 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
if(gaps == -kmersize) {
/* Perfect match */
score += V_score->weight - (kmersize - 1) * M;
+ } else if(gaps == 0) {
+ score += V_score->weight + MM;
} else if(0 < gaps) {
/* mismatch or insersion */
if(gaps <= 2) {
@@ -5441,9 +5555,12 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
} else {
score += V_score->weight + (W1 + (gaps - 1) * U);
}
- } else {
+ } else if(mlen != kmersize) {
/* snp */
- score += V_score->weight + (gaps + 1) * M + MM;
+ score += V_score->weight + gaps * M + MM;
+ } else {
+ /* indel */
+ score += V_score->weight + gaps * M - (gaps + 1) * U + W1;
}
/* verify extension */
@@ -5559,22 +5676,22 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
cStart_r = -1;
if(!best_score->score || !best_score_r->score) {
if(best_score->score) {
- tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates, Score, extendScore, include);
+ tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates, Score, extendScore, include);
cStart = tmp_score->start;
start = cStart;
len = best_score->end - start;
rc = 1;
} else {
- tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates_r, Score, extendScore, include);
+ tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates_r, Score, extendScore, include);
cStart_r = tmp_score->start;
start = cStart_r;
len = best_score_r->end - start;
rc = 2;
}
} else {
- tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates, Score, extendScore, include);
+ tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates, Score, extendScore, include);
cStart = tmp_score->start;
- tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates_r, Score, extendScore, include);
+ tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates_r, Score, extendScore, include);
cStart_r = tmp_score->start;;
rc = chooseChain(best_score, best_score_r, cStart, cStart_r, &start, &len);
}
@@ -5615,7 +5732,7 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
/* update best template candidates */
template = *bests;
*bests = 0;
- getChainTemplates(V_score, rewards, template_lengths, qseq->seqlen, kmersize, bests, Score, extendScore, include);
+ getChainTemplates(V_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bests, Score, extendScore, include);
/* update bests */
*bestTemplates += *bests;
*bests = template;
@@ -5655,7 +5772,7 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
/* update best template candidates */
template = *bests;
*bests = 0;
- getChainTemplates(V_score, rewards, template_lengths, qseq->seqlen, kmersize, bests, Score, extendScore, include);
+ getChainTemplates(V_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bests, Score, extendScore, include);
/* update bests */
*bestTemplates_r += *bests;
*bests = template;
@@ -5729,7 +5846,7 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
ties = 0;
rc = 0;
if(best_score) {
- if(best_score->score && (tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates, Score, extendScore, include))) {
+ if(best_score->score && (tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates, Score, extendScore, include))) {
cStart = tmp_score->start;
cover = queSeqmentTree(chainSeqments->root, cStart, best_score->end);
@@ -5748,7 +5865,7 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
while(best_score && best_score->score == 0) {
/* find best anker */
if((best_score = getBestAnker(&VF_scores, &ties, template_lengths))) {
- if(kmersize < best_score->score && (tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates, Score, extendScore, include))) {
+ if(kmersize < best_score->score && (tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates, Score, extendScore, include))) {
/* check chain */
cStart = tmp_score->start;
cover = queSeqmentTree(chainSeqments->root, cStart, best_score->end);
@@ -5771,7 +5888,7 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
}
if(best_score_r) {
- if(best_score_r->score && (tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates_r, Score, extendScore, include))) {
+ if(best_score_r->score && (tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates_r, Score, extendScore, include))) {
cStart_r = tmp_score->start;
cover = queSeqmentTree(chainSeqments->root, cStart_r, best_score_r->end);
//cover = queSeqmentTree(chainSeqments->root, qseq->seqlen - best_score_r->end, qseq->seqlen - cStart_r);
@@ -5791,7 +5908,7 @@ int save_kmers_chain(const HashMapKMA *templates, const Penalties *rewards, int
while(best_score_r && best_score_r->score == 0) {
/* find best anker */
if((best_score_r = getBestAnker(&VR_scores, &ties, template_lengths))) {
- if(kmersize < best_score_r->score && (tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates_r, Score, extendScore, include))) {
+ if(kmersize < best_score_r->score && (tmp_score = getChainTemplates(best_score_r, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates_r, Score, extendScore, include))) {
/* check chain */
cStart_r = tmp_score->start;
cover = queSeqmentTree(chainSeqments->root, cStart_r, best_score_r->end);
@@ -6124,7 +6241,7 @@ int save_kmers_sparse_chain(const HashMapKMA *templates, const Penalties *reward
/* snp */
Ms += kmersize;
++MMs;
- } else {//if(kmersize < gaps) {
+ } else {
if(last) {
/* update and link between ankers */
V_score->weight = Ms * M + MMs * MM + Us * U + W1s * W1;
@@ -6141,10 +6258,7 @@ int save_kmers_sparse_chain(const HashMapKMA *templates, const Penalties *reward
Us = 0;
W1s = 0;
++hitCounter;
- } /*else {
- // unlikely deletion or random k-mer mismatch,
- // assume better and go random zero score
- }*/
+ }
} else {
if(last) {
/* update and link between ankers */
@@ -6258,9 +6372,12 @@ int save_kmers_sparse_chain(const HashMapKMA *templates, const Penalties *reward
} else {
score += V_score->weight + (W1 + (gaps - 1) * U);
}
- } else {
+ } else if(mlen != kmersize) {
/* snp */
- score += V_score->weight + (gaps + 1) * M + MM;
+ score += V_score->weight + gaps * M + MM;
+ } else {
+ /* indel */
+ score += V_score->weight + gaps * M - (gaps + 1) * U + W1;
}
/* verify extension */
@@ -6357,7 +6474,7 @@ int save_kmers_sparse_chain(const HashMapKMA *templates, const Penalties *reward
and zero out ankers */
*include = SU;
*bestTemplates = 0;
- tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates, Score, extendScore, include);
+ tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates, Score, extendScore, include);
score = best_score->score;
start = tmp_score->start;
len = best_score->end - start;
@@ -6401,7 +6518,7 @@ int save_kmers_sparse_chain(const HashMapKMA *templates, const Penalties *reward
/* update best template candidates */
template = *bests;
*bests = 0;
- getChainTemplates(V_score, rewards, template_lengths, qseq->seqlen, kmersize, bests, Score, extendScore, include);
+ getChainTemplates(V_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bests, Score, extendScore, include);
/* update */
*bestTemplates += *bests;
*bests = template;
@@ -6442,7 +6559,7 @@ int save_kmers_sparse_chain(const HashMapKMA *templates, const Penalties *reward
while(best_score->score == 0) {
/* find best anker */
if((best_score = getBestAnker(&VF_scores, &ties, template_lengths))) {
- if(kmersize < best_score->score && (tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, bestTemplates, Score, extendScore, include))) {
+ if(kmersize < best_score->score && (tmp_score = getChainTemplates(best_score, rewards, template_lengths, qseq->seqlen, kmersize, mlen, bestTemplates, Score, extendScore, include))) {
/* check chain */
start = tmp_score->start;
cover = queSeqmentTree(chainSeqments->root, start, best_score->end);
=====================================
seqparse.c
=====================================
@@ -38,6 +38,7 @@ int openAndDetermine(FileBuff *inputfile, char *filename) {
} else {
openFileBuff(inputfile, filename, "rb");
}
+ inputfile->buffer[0] = 0;
if(buff_FileBuff(inputfile)) {
check = (short unsigned *) inputfile->buffer;
if(*check == 35615) {
@@ -47,14 +48,13 @@ int openAndDetermine(FileBuff *inputfile, char *filename) {
} else {
buffFileBuff = &buff_FileBuff;
}
- } else {
- inputfile->buffer[0] = 0;
}
-
if(inputfile->buffer[0] == '@') { //FASTQ
FASTQ |= 1;
} else if(inputfile->buffer[0] == '>') { //FASTA
FASTQ |= 2;
+ } else if(inputfile->buffer[0] == 0) {
+ fprintf(stderr, "Empty file:\t%s\n", filename);
} else {
fprintf(stderr, "Cannot determine format of file:\t%s\n", filename);
errno |= 1;
=====================================
sparse.c
=====================================
@@ -241,7 +241,7 @@ void save_kmers_sparse(const HashMapKMA *templates, HashMap_kmers *foundKmers, C
}
}
-void run_input_sparse(const HashMapKMA *templates, char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int kmersize, char *trans, const double *prob, FILE *out) {
+void run_input_sparse(const HashMapKMA *templates, char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int kmersize, char *trans, const double *prob, FILE *out) {
int FASTQ, fileCounter, phredScale, phredCut, start, end;
char *filename;
@@ -289,7 +289,7 @@ void run_input_sparse(const HashMapKMA *templates, char **inputfiles, int fileCo
qseq->len = end - start;
/* print */
- if(qseq->len > kmersize && minQ <= eQual(seq + start, qseq->len, minQ, prob - phredScale)) {
+ if(qseq->len > kmersize && qseq->len != hardmaskQ(qseq->seq + start, seq + start, qseq->len, phredScale, minmaskQ) && minQ <= eQual(seq + start, qseq->len, minQ, prob - phredScale)) {
/* translate to kmers */
Kmers->n = translateToKmersAndDump(Kmers->kmers, Kmers->n, Kmers->size, qseq->seq + start, qseq->len, templates, out);
}
@@ -588,13 +588,13 @@ int save_kmers_sparse_batch(char *templatefilename, char *outputfilename, char *
score_tot_add += node->key;
if(prev == 0) {
deConTable = node->next;
- free(node);
free(node->value);
+ free(node);
node = deConTable;
} else {
prev->next = node->next;
- free(node);
free(node->value);
+ free(node);
node = prev->next;
}
} else {
=====================================
sparse.h
=====================================
@@ -24,5 +24,5 @@
int translateToKmersAndDump(long unsigned *Kmers, int n, int max, unsigned char *qseq, int seqlen, const HashMapKMA *templates, FILE *out);
char ** load_DBs_Sparse(char *templatefilename, unsigned **template_lengths, unsigned **template_ulengths, unsigned shm);
void save_kmers_sparse(const HashMapKMA *templates, HashMap_kmers *foundKmers, CompKmers *compressor);
-void run_input_sparse(const HashMapKMA *templates, char **inputfiles, int fileCount, int minPhred, int minQ, int fiveClip, int threeClip, int kmersize, char *trans, const double *prob, FILE *out);
+void run_input_sparse(const HashMapKMA *templates, char **inputfiles, int fileCount, int minPhred, int minmaskQ, int minQ, int fiveClip, int threeClip, int kmersize, char *trans, const double *prob, FILE *out);
int save_kmers_sparse_batch(char *templatefilename, char *outputfilename, char *exePrev, double ID_t, double Depth_t, double evalue, char ss, unsigned shm);
=====================================
spltdb.c
=====================================
@@ -739,9 +739,6 @@ int runKMA_spltDB(char **templatefilenames, int targetNum, char *outputfilename,
--uPtr;
--i;
} else {
- if(targetInfo[i] == 0) {
- *uPtr = get_ankers_spltDB(targetInfo[i], matched_templates, qseq_comp, header, &flag, inputfiles[i]);
- }
*uPtr = get_ankers_spltDB(targetInfo[i], matched_templates, qseq_comp, header, &flag, inputfiles[i]);
--uPtr;
--i;
=====================================
stdstat.h
=====================================
@@ -20,6 +20,7 @@
#define MIN(X, Y) ((X < Y) ? X : Y)
#define MAX(X, Y) ((X < Y) ? Y : X)
#define murmur(index, kmer) index = (3323198485ul ^ kmer) * 0x5bd1e995; index ^= index >> 15;
+#define murmur3(index, kmer) index = kmer * 0xcc9e2d51; index = (index << 15) | (index >> 17); index *= 0x1b873593; index = (index << 13) | (index >> 19); index ^= index >> 16; index *= 0x85ebca6b; index ^= index >> 13; index *= 0xc2b2ae35; index ^= index >> 16;
extern int (*cmp)(int, int);
int cmp_or(int t, int q);
=====================================
trim.c
=====================================
@@ -136,6 +136,7 @@ static int helpMessage(FILE *out) {
fprintf(out, "# %16s\t%-32s\t%s\n", "-ml", "Minimum length", "16");
fprintf(out, "# %16s\t%-32s\t%s\n", "-xl", "Maximum length", "2147483647");
fprintf(out, "# %16s\t%-32s\t%s\n", "-mp", "Minimum phred", "20");
+ fprintf(out, "# %16s\t%-32s\t%s\n", "-mi", "Minimum internal phred score", "0");
fprintf(out, "# %16s\t%-32s\t%s\n", "-eq", "Minimum average quality", "0");
fprintf(out, "# %16s\t%-32s\t%s\n", "-5p", "Trim 5 prime", "0");
fprintf(out, "# %16s\t%-32s\t%s\n", "-3p", "Trim 3 prime", "0");
@@ -163,8 +164,8 @@ int trim_main(int argc, char *argv[]) {
0.0000000000398107, 0.0000000000316228, 0.0000000000251189, 0.0000000000199526, 0.0000000000158489, 0.0000000000125893, 0.0000000000100000, 0.0000000000079433,
0.0000000000063096, 0.0000000000050119, 0.0000000000039811, 0.0000000000031623, 0.0000000000025119, 0.0000000000019953, 0.0000000000015849, 0.0000000000012589,
0.0000000000010000, 0.0000000000007943, 0.0000000000006310, 0.0000000000005012, 0.0000000000003981, 0.0000000000003162, 0.0000000000002512, 0.0000000000001995};
- int i, args, minPhred, minQ, fiveClip, threeClip, minlen, maxlen;
- int fileCounter, fileCounter_PE, fileCounter_INT, fileCount;
+ int i, args, fileCounter, fileCounter_PE, fileCounter_INT, fileCount;
+ int minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, maxlen;
long unsigned totFrags;
char **inputfiles, **inputfiles_PE, **inputfiles_INT;
char *to2Bit, *outputfilename, *outputfilename_int, *exeBasic;
@@ -172,6 +173,7 @@ int trim_main(int argc, char *argv[]) {
/* set defaults */
minPhred = 20;
+ minmaskQ = 0;
minQ = 0;
fiveClip = 0;
threeClip = 0;
@@ -291,6 +293,15 @@ int trim_main(int argc, char *argv[]) {
exit(1);
}
}
+ } else if(strcmp(argv[args], "-mi") == 0) {
+ ++args;
+ if(args < argc) {
+ minmaskQ = strtoul(argv[args], &exeBasic, 10);
+ if(*exeBasic != 0) {
+ fprintf(stderr, "# Invalid internal minimum phred score parsed\n");
+ exit(1);
+ }
+ }
} else if(strcmp(argv[args], "-eq") == 0) {
++args;
if(args < argc) {
@@ -389,20 +400,23 @@ int trim_main(int argc, char *argv[]) {
inputfiles = smalloc(sizeof(char *));
*inputfiles = "--";
}
+ if(minPhred < minmaskQ) {
+ minPhred = minmaskQ;
+ }
/* SE */
if(fileCounter > 0) {
- totFrags += run_input(inputfiles, fileCounter, minPhred, minQ, fiveClip, threeClip, minlen, maxlen, to2Bit, prob, out);
+ totFrags += run_input(inputfiles, fileCounter, minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, maxlen, to2Bit, prob, out);
}
/* PE */
if(fileCounter_PE > 0) {
- totFrags += run_input_PE(inputfiles_PE, fileCounter_PE, minPhred, minQ, fiveClip, threeClip, minlen, to2Bit, prob, out);
+ totFrags += run_input_PE(inputfiles_PE, fileCounter_PE, minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, to2Bit, prob, out);
}
/* INT */
if(fileCounter_INT > 0) {
- totFrags += run_input_INT(inputfiles_INT, fileCounter_INT, minPhred, minQ, fiveClip, threeClip, minlen, to2Bit, prob, out);
+ totFrags += run_input_INT(inputfiles_INT, fileCounter_INT, minPhred, minmaskQ, minQ, fiveClip, threeClip, minlen, to2Bit, prob, out);
}
fprintf(stderr, "#\n# Total number of query fragment after trimming:\t%lu\n", totFrags);
=====================================
version.h
=====================================
@@ -17,4 +17,4 @@
* limitations under the License.
*/
-#define KMA_VERSION "1.4.11"
+#define KMA_VERSION "1.4.14"
View it on GitLab: https://salsa.debian.org/med-team/kma/-/commit/149fa09e1c71a0fcbc6851116e4ea40a00dcee2a
--
View it on GitLab: https://salsa.debian.org/med-team/kma/-/commit/149fa09e1c71a0fcbc6851116e4ea40a00dcee2a
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230823/0dc76e60/attachment-0001.htm>
More information about the debian-med-commit
mailing list