[med-svn] [Git][med-team/seqtk][master] 5 commits: New upstream version 1.4
Andreas Tille (@tille)
gitlab at salsa.debian.org
Thu Nov 9 06:04:45 GMT 2023
Andreas Tille pushed to branch master at Debian Med / seqtk
Commits:
52284c3c by Andreas Tille at 2023-11-08T21:32:33+01:00
New upstream version 1.4
- - - - -
18d68bec by Andreas Tille at 2023-11-08T21:32:33+01:00
routine-update: New upstream version
- - - - -
3bfea764 by Andreas Tille at 2023-11-08T21:32:33+01:00
Update upstream source from tag 'upstream/1.4'
Update to upstream version '1.4'
with Debian dir 0e67e73b55ab78be4cc1ef401e05d3dd4bde199c
- - - - -
4e334ce2 by Andreas Tille at 2023-11-08T21:32:33+01:00
routine-update: Standards-Version: 4.6.2
- - - - -
8d06ce37 by Andreas Tille at 2023-11-08T21:34:22+01:00
routine-update: Ready to upload to unstable
- - - - -
6 changed files:
- + NEWS.md
- README.md
- debian/changelog
- debian/control
- kseq.h
- seqtk.c
Changes:
=====================================
NEWS.md
=====================================
@@ -0,0 +1,26 @@
+Release 1.4-r122 (19 May 2023)
+------------------------------
+
+Notable changes:
+
+ * Improvement: faster FASTX parsing (#123)
+
+ * New feature: added the `telo` command to output telomere regions.
+
+ * New feature: added the `size` command to count the number of sequences and
+ the number of bases. Lighter and thus faster than `comp`.
+
+ * New feature: added the `hpc` command to compress homopolymers in input
+ sequences.
+
+ * New feature: added the `split` command to split a large input file into
+ multiple smaller files.
+
+ * New feature: added the `gap` command to output non-ACGT regions in the input
+ file.
+
+ * New feature: added option `-s` to command `subseq` to support the strand
+ field in BED. For the moment, this option does not work with other subseq
+ options.
+
+(1.4: 19 May 2023, r122)
=====================================
README.md
=====================================
@@ -59,3 +59,6 @@ Seqtk Examples
seqtk trimfq -b 5 -e 10 in.fa > out.fa
+* Find telomere (TTAGGG)n repeats:
+
+ seqtk telo seq.fa > telo.bed 2> telo.count
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+seqtk (1.4-1) unstable; urgency=medium
+
+ * New upstream version
+ * Standards-Version: 4.6.2 (routine-update)
+
+ -- Andreas Tille <tille at debian.org> Wed, 08 Nov 2023 21:33:35 +0100
+
seqtk (1.3-4) unstable; urgency=medium
* Fix watch file
=====================================
debian/control
=====================================
@@ -6,7 +6,7 @@ Section: science
Priority: optional
Build-Depends: debhelper-compat (= 13),
zlib1g-dev
-Standards-Version: 4.6.1
+Standards-Version: 4.6.2
Vcs-Browser: https://salsa.debian.org/med-team/seqtk
Vcs-Git: https://salsa.debian.org/med-team/seqtk.git
Homepage: https://github.com/lh3/seqtk
=====================================
kseq.h
=====================================
@@ -108,8 +108,8 @@ typedef struct __kstring_t {
} else break; \
} \
if (delimiter == KS_SEP_LINE) { \
- for (i = ks->begin; i < ks->end; ++i) \
- if (ks->buf[i] == '\n') break; \
+ unsigned char *sep = (unsigned char*)memchr(ks->buf + ks->begin, '\n', ks->end - ks->begin); \
+ i = sep != NULL ? sep - ks->buf : ks->end; \
} else if (delimiter > KS_SEP_MAX) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == delimiter) break; \
=====================================
seqtk.c
=====================================
@@ -1,6 +1,7 @@
/* The MIT License
- Copyright (c) 2008-2016 Broad Institute
+ Copyright (c) 2018- Dana-Farber Cancer Institute
+ 2008-2018 Broad Institute
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@@ -41,6 +42,7 @@ KSEQ_INIT(gzFile, gzread)
typedef struct {
int n, m;
uint64_t *a;
+ int8_t *rev;
} reglist_t;
#include "khash.h"
@@ -49,6 +51,65 @@ KHASH_SET_INIT_INT64(64)
typedef kh_reg_t reghash_t;
+reghash_t *stk_reg_read_alt(const char *fn)
+{
+ reghash_t *h = kh_init(reg);
+ gzFile fp;
+ kstream_t *ks;
+ int dret;
+ kstring_t str = {0,0,0};
+
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) return 0;
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
+ int i, c, st = -1, en = -1, rev = 0;
+ char *p, *q;
+ reglist_t *r;
+ khint_t k;
+ for (i = 0, p = q = str.s;; ++p) {
+ if (*p == '\t' || *p == '\0') {
+ c = *p;
+ *p = 0;
+ if (i == 1) {
+ if (isdigit(*q)) st = strtol(q, &q, 10);
+ if (q != p) st = -1;
+ } else if (i == 2) {
+ if (isdigit(*q)) en = strtol(q, &q, 10);
+ if (q != p) en = -1;
+ } else if (i == 5) {
+ if (*q == '+') rev = 1;
+ else if (*q == '-') rev = -1;
+ }
+ ++i, q = p + 1;
+ if (c == 0) break;
+ }
+ }
+ if (i == 0) continue;
+ k = kh_get(reg, h, str.s);
+ if (k == kh_end(h)) {
+ int ret;
+ char *s = strdup(str.s);
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(reglist_t));
+ }
+ r = &kh_val(h, k);
+ if (en < 0 && st > 0) en = st, st = st - 1; // if there is only one column
+ if (st < 0) st = 0, en = INT_MAX;
+ if (r->n == r->m) {
+ r->m = r->m? r->m<<1 : 4;
+ r->a = (uint64_t*)realloc(r->a, r->m * 8);
+ r->rev = (int8_t*)realloc(r->rev, r->m);
+ }
+ r->a[r->n] = (uint64_t)st<<32 | en;
+ r->rev[r->n++] = rev;
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str.s);
+ return h;
+}
+
reghash_t *stk_reg_read(const char *fn)
{
reghash_t *h = kh_init(reg);
@@ -106,6 +167,7 @@ void stk_reg_destroy(reghash_t *h)
for (k = 0; k < kh_end(h); ++k) {
if (kh_exist(h, k)) {
free(kh_val(h, k).a);
+ free(kh_val(h, k).rev);
free((char*)kh_key(h, k));
}
}
@@ -168,42 +230,43 @@ char comp_tab[] = {
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
};
-static void stk_printstr(const kstring_t *s, unsigned line_len)
+static void stk_printstr(FILE *fp, const kstring_t *s, unsigned line_len)
{
if (line_len != UINT_MAX && line_len != 0) {
int i, rest = s->l;
for (i = 0; i < s->l; i += line_len, rest -= line_len) {
- putchar('\n');
- if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout);
- else fwrite(s->s + i, 1, rest, stdout);
+ fputc('\n', fp);
+ if (rest > line_len) fwrite(s->s + i, 1, line_len, fp);
+ else fwrite(s->s + i, 1, rest, fp);
}
- putchar('\n');
+ fputc('\n', fp);
} else {
- putchar('\n');
- puts(s->s);
+ fputc('\n', fp);
+ fputs(s->s, fp);
+ fputc('\n', fp);
}
}
-static inline void stk_printseq_renamed(const kseq_t *s, int line_len, const char *prefix, int64_t n)
+static inline void stk_printseq_renamed(FILE *fp, const kseq_t *s, int line_len, const char *prefix, int64_t n)
{
- putchar(s->qual.l? '@' : '>');
+ fputc(s->qual.l? '@' : '>', fp);
if (n >= 0) {
- if (prefix) fputs(prefix, stdout);
- printf("%lld", (long long)n);
- } else fputs(s->name.s, stdout);
+ if (prefix) fputs(prefix, fp);
+ fprintf(fp, "%lld", (long long)n);
+ } else fputs(s->name.s, fp);
if (s->comment.l) {
- putchar(' '); fputs(s->comment.s, stdout);
+ fputc(' ', fp); fputs(s->comment.s, fp);
}
- stk_printstr(&s->seq, line_len);
+ stk_printstr(fp, &s->seq, line_len);
if (s->qual.l) {
- putchar('+');
- stk_printstr(&s->qual, line_len);
+ fputc('+', fp);
+ stk_printstr(fp, &s->qual, line_len);
}
}
-static inline void stk_printseq(const kseq_t *s, int line_len)
+static inline void stk_printseq(FILE *fp, const kseq_t *s, int line_len)
{
- stk_printseq_renamed(s, line_len, 0, -1);
+ stk_printseq_renamed(fp, s, line_len, 0, -1);
}
/*
@@ -396,9 +459,9 @@ int stk_comp(int argc, char *argv[])
}
for (k = 0; p && k < p->n; ++k) {
int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
- int la, lb, lc, na, nb, nc, cnt[11];
- if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb];
- else la = 'a', lb = -1, lc = 0;
+ int la, lb, na, nb, nc, cnt[11];
+ if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la];
+ else la = 'a', lb = -1;
na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
memset(cnt, 0, 11 * sizeof(int));
for (i = beg; i < end; ++i) {
@@ -422,7 +485,7 @@ int stk_comp(int argc, char *argv[])
if (b == 10 || b == 5) ++cnt[10];
}
}
- la = a; lb = b; lc = c;
+ la = a; lb = b;
}
if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
else printf("%s\t%d", seq->name.s, l);
@@ -541,6 +604,42 @@ int stk_hety(int argc, char *argv[])
return 0;
}
+int stk_gap(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int len, c, min_size = 50;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: seqtk gap [-l %d] <in.fa>\n", min_size);
+ return 1;
+ }
+ while ((c = getopt(argc, argv, "l:")) >= 0) {
+ switch (c) {
+ case 'l': min_size = atoi(optarg); break;
+ }
+ }
+ fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+ if (fp == 0) {
+ fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+ return 1;
+ }
+ seq = kseq_init(fp);
+ while ((len = kseq_read(seq)) >= 0) {
+ int i, l;
+ for (i = l = 0; i <= len; ++i) {
+ c = i < len? seq_nt6_table[(uint8_t)seq->seq.s[i]] : 5;
+ if (i == len || (c >= 1 && c <= 4)) {
+ if (l > 0 && l >= min_size)
+ printf("%s\t%d\t%d\n", seq->name.s, i - l, i);
+ l = 0;
+ } else ++l;
+ }
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
/* subseq */
int stk_subseq(int argc, char *argv[])
@@ -548,23 +647,28 @@ int stk_subseq(int argc, char *argv[])
khash_t(reg) *h = kh_init(reg);
gzFile fp;
kseq_t *seq;
- int l, i, j, c, is_tab = 0, line = 0;
+ int l, i, j, c, is_tab = 0, line = 0, do_strand = 0;
+ char *seq_buf = 0;
+ uint32_t seq_max = 0;
khint_t k;
- while ((c = getopt(argc, argv, "tl:")) >= 0) {
+ while ((c = getopt(argc, argv, "tl:s")) >= 0) {
switch (c) {
case 't': is_tab = 1; break;
+ case 's': do_strand = 1; break;
case 'l': line = atoi(optarg); break;
}
}
if (optind + 2 > argc) {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n\n");
- fprintf(stderr, "Options: -t TAB delimited output\n");
- fprintf(stderr, " -l INT sequence line length [%d]\n\n", line);
- fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n");
+ fprintf(stderr, "Usage: seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -t TAB delimited output\n");
+ fprintf(stderr, " -s strand aware\n");
+ fprintf(stderr, " -l INT sequence line length [%d]\n", line);
+ fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n");
return 1;
}
- h = stk_reg_read(argv[optind+1]);
+ if (do_strand) h = stk_reg_read_alt(argv[optind+1]);
+ else h = stk_reg_read(argv[optind+1]);
if (h == 0) {
fprintf(stderr, "[E::%s] failed to read the list of regions in file '%s'\n", __func__, argv[optind+1]);
return 1;
@@ -598,21 +702,37 @@ int stk_subseq(int argc, char *argv[])
if (seq->comment.l) printf(" %s", seq->comment.s);
} else printf("%s\t%d\t", seq->name.s, beg + 1);
if (end > seq->seq.l) end = seq->seq.l;
- for (j = 0; j < end - beg; ++j) {
- if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n');
- putchar(seq->seq.s[j + beg]);
- }
- putchar('\n');
- if (seq->qual.l != seq->seq.l || is_tab) continue;
- printf("+");
- for (j = 0; j < end - beg; ++j) {
- if (j == 0 || (line > 0 && j % line == 0)) putchar('\n');
- putchar(seq->qual.s[j + beg]);
+ if (do_strand && p->rev) { // TODO: the strand mode only works with FASTA
+ if (end - beg >= seq_max) {
+ seq_max = end - beg;
+ kroundup32(seq_max);
+ seq_buf = (char*)realloc(seq_buf, seq_max);
+ }
+ if (p->rev[i] < 0)
+ for (j = end - 1; j >= beg; --j)
+ seq_buf[end - 1 - j] = (uint8_t)seq->seq.s[j] >= 128? 'N' : comp_tab[(uint8_t)seq->seq.s[j]];
+ else memcpy(seq_buf, &seq->seq.s[beg], end - beg);
+ putchar('\n');
+ fwrite(seq_buf, 1, end - beg, stdout);
+ putchar('\n');
+ } else {
+ for (j = 0; j < end - beg; ++j) {
+ if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n');
+ putchar(seq->seq.s[j + beg]);
+ }
+ putchar('\n');
+ if (seq->qual.l != seq->seq.l || is_tab) continue;
+ printf("+");
+ for (j = 0; j < end - beg; ++j) {
+ if (j == 0 || (line > 0 && j % line == 0)) putchar('\n');
+ putchar(seq->qual.s[j + beg]);
+ }
+ putchar('\n');
}
- putchar('\n');
}
}
// free
+ free(seq_buf);
kseq_destroy(seq);
gzclose(fp);
stk_reg_destroy(h);
@@ -876,6 +996,54 @@ int stk_listhet(int argc, char *argv[])
return 0;
}
+int stk_split(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ int c, i, l, n = 10, len = 0;
+ char *prefix, *fn;
+ FILE **out;
+ while ((c = getopt(argc, argv, "n:l:")) >= 0) {
+ if (c == 'n') n = atoi(optarg);
+ else if (c == 'l') len = atoi(optarg);
+ }
+ if (argc == optind) {
+ fprintf(stderr, "Usage: seqtk split [options] <prefix> <in.fa>\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -n INT number of files [%d]\n", n);
+ fprintf(stderr, " -l INT line length [%d]\n", len);
+ return 1;
+ }
+ prefix = argv[optind];
+ fp = (strcmp(argv[optind+1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind+1], "r");
+ if (fp == 0) {
+ fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+ return 1;
+ }
+ out = (FILE**)malloc(sizeof(FILE*) * n);
+ fn = (char*)malloc(strlen(prefix) + 10);
+ for (i = 0; i < n; ++i) {
+ sprintf(fn, "%s.%.5d.fa", prefix, i + 1);
+ out[i] = fopen(fn, "w+");
+ if (out[i] == 0) {
+ fprintf(stderr, "ERROR: failed to create file %s\n", fn);
+ exit(1);
+ }
+ }
+ free(fn);
+ seq = kseq_init(fp);
+ i = 0;
+ while ((l = kseq_read(seq)) >= 0) {
+ stk_printseq(out[i % n], seq, len);
+ ++i;
+ }
+ for (i = 0; i < n; ++i) fclose(out[i]);
+ free(out);
+ kseq_destroy(seq);
+ gzclose(fp);
+ return 0;
+}
+
/* cutN */
static int cutN_min_N_tract = 1000;
static int cutN_nonN_penalty = 10;
@@ -1072,11 +1240,11 @@ int stk_sample(int argc, char *argv[])
if (num) {
uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs);
if (y < num) cpy_kseq(&buf[y], seq);
- } else if (r < frac) stk_printseq(seq, UINT_MAX);
+ } else if (r < frac) stk_printseq(stdout, seq, UINT_MAX);
}
for (i = 0; i < num; ++i) {
kseq_t *p = &buf[i];
- if (p->seq.l) stk_printseq(p, UINT_MAX);
+ if (p->seq.l) stk_printseq(stdout, p, UINT_MAX);
free(p->seq.s); free(p->qual.s); free(p->name.s);
}
if (buf != NULL) free(buf);
@@ -1119,7 +1287,7 @@ int stk_sample(int argc, char *argv[])
n_seqs = 0;
while (kseq_read(seq) >= 0)
if (kh_get(64, hash, ++n_seqs) != kh_end(hash))
- stk_printseq(seq, UINT_MAX);
+ stk_printseq(stdout, seq, UINT_MAX);
kh_destroy(64, hash);
}
@@ -1186,7 +1354,7 @@ int stk_seq(int argc, char *argv[])
khash_t(reg) *h = 0;
krand_t *kr = 0;
- while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:SF:")) >= 0) {
+ while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:SF:x")) >= 0) {
switch (c) {
case 'a':
case 'A': flag |= 1; break;
@@ -1199,6 +1367,7 @@ int stk_seq(int argc, char *argv[])
case 'N': flag |= 128; break;
case 'U': flag |= 256; break;
case 'S': flag |= 512; break;
+ case 'x': flag |= 1024; break;
case 'M': h = stk_reg_read(optarg); break;
case 'n': mask_chr = *optarg; break;
case 'Q': qual_shift = atoi(optarg); break;
@@ -1234,6 +1403,7 @@ int stk_seq(int argc, char *argv[])
fprintf(stderr, " -2 output the 2n reads only\n");
fprintf(stderr, " -V shift quality by '(-Q) - 33'\n");
fprintf(stderr, " -U convert all bases to uppercases\n");
+ fprintf(stderr, " -x convert all lowercases to -n\n");
fprintf(stderr, " -S strip of white spaces in sequences\n");
fprintf(stderr, "\n");
free(kr);
@@ -1282,6 +1452,9 @@ int stk_seq(int argc, char *argv[])
if (flag & 256) // option -U: convert to uppercases
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = toupper(seq->seq.s[i]);
+ else if ((flag & 1024) && mask_chr > 0)
+ for (i = 0; i < seq->seq.l; ++i)
+ seq->seq.s[i] = islower(seq->seq.s[i])? mask_chr : seq->seq.s[i];
if (flag & 1) seq->qual.l = 0; // option -a: fastq -> fasta
else if (fake_qual >= 33 && fake_qual <= 127) {
if (seq->qual.m < seq->seq.m) {
@@ -1317,7 +1490,7 @@ int stk_seq(int argc, char *argv[])
if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break;
if (i < seq->seq.l) continue;
}
- stk_printseq(seq, line_len);
+ stk_printseq(stdout, seq, line_len);
}
kseq_destroy(seq);
gzclose(fp);
@@ -1409,8 +1582,8 @@ int stk_mergepe(int argc, char *argv[])
fprintf(stderr, "[W::%s] the 2nd file has fewer records.\n", __func__);
break;
}
- stk_printseq(seq[0], 0);
- stk_printseq(seq[1], 0);
+ stk_printseq(stdout, seq[0], 0);
+ stk_printseq(stdout, seq[1], 0);
}
if (kseq_read(seq[1]) >= 0)
fprintf(stderr, "[W::%s] the 1st file has fewer records.\n", __func__);
@@ -1445,8 +1618,8 @@ int stk_dropse(int argc, char *argv[])
is_diff = strncmp(p->s, q->s, l);
} else is_diff = 1;
if (!is_diff) {
- stk_printseq(&last, 0);
- stk_printseq(seq, 0);
+ stk_printseq(stdout, &last, 0);
+ stk_printseq(stdout, seq, 0);
last.name.l = 0;
} else cpy_kseq(&last, seq);
} else cpy_kseq(&last, seq);
@@ -1458,6 +1631,59 @@ int stk_dropse(int argc, char *argv[])
return 0;
}
+static inline int kputc(int c, kstring_t *s)
+{
+ if (s->l + 1 >= s->m) {
+ char *tmp;
+ s->m = s->l + 2;
+ kroundup32(s->m);
+ if ((tmp = (char*)realloc(s->s, s->m)))
+ s->s = tmp;
+ else
+ return EOF;
+ }
+ s->s[s->l++] = c;
+ s->s[s->l] = 0;
+ return c;
+}
+
+int stk_hpc(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ kstring_t str = {0,0,0};
+
+ if (argc == 1 && isatty(fileno(stdin))) {
+ fprintf(stderr, "Usage: seqtk hpc <in.fq>\n");
+ return 1;
+ }
+ fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) {
+ fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+ return 1;
+ }
+ seq = kseq_init(fp);
+
+ while (kseq_read(seq) >= 0) {
+ int i, last;
+ str.l = 0;
+ if (seq->seq.l == 0) continue;
+ for (i = 1, last = 0; i <= seq->seq.l; ++i) {
+ if (i == seq->seq.l || seq->seq.s[last] != seq->seq.s[i]) {
+ kputc(seq->seq.s[last], &str);
+ last = i;
+ }
+ }
+ putchar('>'); puts(seq->name.s);
+ puts(str.s);
+ }
+
+ kseq_destroy(seq);
+ gzclose(fp);
+ free(str.s);
+ return 0;
+}
+
int stk_rename(int argc, char *argv[])
{
gzFile fp;
@@ -1487,18 +1713,18 @@ int stk_rename(int argc, char *argv[])
is_diff = strncmp(p->s, q->s, l);
} else is_diff = 1;
if (!is_diff) {
- stk_printseq_renamed(&last, 0, prefix, n);
- stk_printseq_renamed(seq, 0, prefix, n);
+ stk_printseq_renamed(stdout, &last, 0, prefix, n);
+ stk_printseq_renamed(stdout, seq, 0, prefix, n);
last.name.l = 0;
++n;
} else {
- stk_printseq_renamed(&last, 0, prefix, n);
+ stk_printseq_renamed(stdout, &last, 0, prefix, n);
++n;
cpy_kseq(&last, seq);
}
} else cpy_kseq(&last, seq);
}
- if (last.name.l) stk_printseq_renamed(&last, 0, prefix, n);
+ if (last.name.l) stk_printseq_renamed(stdout, &last, 0, prefix, n);
kseq_destroy(seq);
gzclose(fp);
@@ -1670,18 +1896,136 @@ int stk_fqchk(int argc, char *argv[])
return 0;
}
+int stk_size(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ uint64_t n = 0, l = 0;
+ if (argc == 1 && isatty(fileno(stdin))) {
+ fprintf(stderr, "Usage: seqtk size <in.fq>\n");
+ return 1;
+ }
+ fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) {
+ fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+ return 1;
+ }
+ seq = kseq_init(fp);
+ while (kseq_read(seq) >= 0) {
+ ++n;
+ l += seq->seq.l;
+ }
+ kseq_destroy(seq);
+ gzclose(fp);
+ printf("%lld\t%lld\n", (long long)n, (long long)l);
+ return 0;
+}
+
+int stk_telo(int argc, char *argv[])
+{
+ gzFile fp;
+ kseq_t *seq;
+ char *telo_seq = "CCCTAA";
+ int c, i, j, len, absent, penalty = 1, max_drop = 2000, min_score = 300;
+ uint64_t x, mask, sum_input = 0, sum_telo = 0;
+ khash_t(64) *h;
+
+ while ((c = getopt(argc, argv, "m:p:d:s:")) >= 0) {
+ if (c == 'm') telo_seq = optarg;
+ else if (c == 'p') penalty = atoi(optarg);
+ else if (c == 'd') max_drop = atoi(optarg);
+ else if (c == 's') min_score = atoi(optarg);
+ }
+ if (penalty < 0) penalty = -penalty;
+ if (argc == optind && isatty(fileno(stdin))) {
+ fprintf(stderr, "Usage: seqtk telo [options] <in.fq>\n");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -m STR motif [%s]\n", telo_seq);
+ fprintf(stderr, " -p INT penalty [%d]\n", penalty);
+ fprintf(stderr, " -d INT max drop [%d]\n", max_drop);
+ fprintf(stderr, " -s INT min score [%d]\n", min_score);
+ return 1;
+ }
+
+ len = strlen(telo_seq);
+ mask = (1ULL<<2*len) - 1;
+
+ h = kh_init(64); // hash table for all roations of the telomere motif
+ kh_resize(64, h, len * 2);
+ for (i = 0; i < len; ++i) {
+ for (j = 0, x = 0; j < len; ++j) {
+ int c = seq_nt6_table[(uint8_t)telo_seq[(i + j) % len]];
+ assert(c >= 1 && c <= 4);
+ x = x<<2 | (c-1);
+ }
+ kh_put(64, h, x, &absent);
+ }
+
+ fp = argc > 1 && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) {
+ fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+ return 1;
+ }
+ seq = kseq_init(fp);
+ while (kseq_read(seq) >= 0) {
+ ssize_t i, l, max_i = -1, st = 0;
+ int64_t score, max;
+ sum_input += seq->seq.l;
+ score = max = 0, max_i = -1;
+ for (i = 0, l = 0, x = 0; i < seq->seq.l; ++i) { // 5'-end, check CCCTAA
+ int hit = 0, c = seq_nt6_table[(uint8_t)seq->seq.s[i]];
+ if (c >= 1 && c <= 4) { // not N
+ x = (x<<2 | (c-1)) & mask;
+ if (++l >= len && kh_get(64, h, x) != kh_end(h)) // x is at least 6bp long and is a telomere motif
+ hit = 1;
+ } else l = 0, x = 0; // N, ambiguous base
+ if (i >= len) score += hit? 1 : -penalty;
+ if (score > max) max = score, max_i = i;
+ else if (max - score > max_drop) break;
+ }
+ if (max >= min_score) {
+ printf("%s\t0\t%ld\t%ld\n", seq->name.s, max_i + 1, seq->seq.l);
+ sum_telo += max_i + 1;
+ st = max_i + 1;
+ }
+ score = max = 0, max_i = -1;
+ for (i = seq->seq.l - 1, l = 0, x = 0; i >= st; --i) { // 3'-end; similar to the for loop above
+ int hit = 0, c = seq_nt6_table[(uint8_t)seq->seq.s[i]];
+ if (c >= 1 && c <= 4) {
+ x = (x<<2 | (4-c)) & mask;
+ if (++l >= len && kh_get(64, h, x) != kh_end(h))
+ hit = 1;
+ } else l = 0, x = 0;
+ if (seq->seq.l - i >= len) score += hit? 1 : -penalty;
+ if (score > max) max = score, max_i = i;
+ else if (max - score > max_drop) break;
+ }
+ if (max >= min_score) {
+ printf("%s\t%ld\t%ld\t%ld\n", seq->name.s, max_i, seq->seq.l, seq->seq.l);
+ sum_telo += seq->seq.l - max_i;
+ }
+ }
+ kh_destroy(64, h);
+ kseq_destroy(seq);
+ gzclose(fp);
+ fprintf(stderr, "%ld\t%ld\n", (long)sum_telo, (long)sum_input);
+ return 0;
+}
+
/* main function */
static int usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "Usage: seqtk <command> <arguments>\n");
- fprintf(stderr, "Version: 1.3-r106\n\n");
+ fprintf(stderr, "Version: 1.4-r122\n\n");
fprintf(stderr, "Command: seq common transformation of FASTA/Q\n");
+ fprintf(stderr, " size report the number sequences and bases\n");
fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n");
fprintf(stderr, " sample subsample sequences\n");
fprintf(stderr, " subseq extract subsequences from FASTA/Q\n");
fprintf(stderr, " fqchk fastq QC (base/quality summary)\n");
fprintf(stderr, " mergepe interleave two PE FASTA/Q files\n");
+ fprintf(stderr, " split split one file into multiple smaller files\n");
fprintf(stderr, " trimfq trim FASTQ using the Phred algorithm\n\n");
fprintf(stderr, " hety regional heterozygosity\n");
fprintf(stderr, " gc identify high- or low-GC regions\n");
@@ -1692,7 +2036,10 @@ static int usage()
fprintf(stderr, " rename rename sequence names\n");
fprintf(stderr, " randbase choose a random base from hets\n");
fprintf(stderr, " cutN cut sequence at long N\n");
+ fprintf(stderr, " gap get the gap locations\n");
fprintf(stderr, " listhet extract the position of each het\n");
+ fprintf(stderr, " hpc homopolyer-compressed sequence\n");
+ fprintf(stderr, " telo identify telomere repeats in asm or long reads\n");
fprintf(stderr, "\n");
return 1;
}
@@ -1711,6 +2058,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "dropse") == 0) return stk_dropse(argc-1, argv+1);
else if (strcmp(argv[1], "randbase") == 0) return stk_randbase(argc-1, argv+1);
else if (strcmp(argv[1], "cutN") == 0) return stk_cutN(argc-1, argv+1);
+ else if (strcmp(argv[1], "gap") == 0) return stk_gap(argc-1, argv+1);
else if (strcmp(argv[1], "listhet") == 0) return stk_listhet(argc-1, argv+1);
else if (strcmp(argv[1], "famask") == 0) return stk_famask(argc-1, argv+1);
else if (strcmp(argv[1], "trimfq") == 0) return stk_trimfq(argc-1, argv+1);
@@ -1719,6 +2067,10 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "seq") == 0) return stk_seq(argc-1, argv+1);
else if (strcmp(argv[1], "kfreq") == 0) return stk_kfreq(argc-1, argv+1);
else if (strcmp(argv[1], "rename") == 0) return stk_rename(argc-1, argv+1);
+ else if (strcmp(argv[1], "split") == 0) return stk_split(argc-1, argv+1);
+ else if (strcmp(argv[1], "hpc") == 0) return stk_hpc(argc-1, argv+1);
+ else if (strcmp(argv[1], "size") == 0) return stk_size(argc-1, argv+1);
+ else if (strcmp(argv[1], "telo") == 0) return stk_telo(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'. Abort!\n", argv[1]);
return 1;
View it on GitLab: https://salsa.debian.org/med-team/seqtk/-/compare/7ad44f4a0cbf2ab6dfcde476c6f0ce899d5d2fcd...8d06ce3787b386ab6151eec8a405ff8dca4e01e4
--
View it on GitLab: https://salsa.debian.org/med-team/seqtk/-/compare/7ad44f4a0cbf2ab6dfcde476c6f0ce899d5d2fcd...8d06ce3787b386ab6151eec8a405ff8dca4e01e4
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231109/fa10ca38/attachment-0001.htm>
More information about the debian-med-commit
mailing list