[med-svn] [Git][med-team/seqtk][upstream] New upstream version 1.4

Andreas Tille (@tille) gitlab at salsa.debian.org
Thu Nov 9 06:05:04 GMT 2023



Andreas Tille pushed to branch upstream at Debian Med / seqtk


Commits:
52284c3c by Andreas Tille at 2023-11-08T21:32:33+01:00
New upstream version 1.4
- - - - -


4 changed files:

- + NEWS.md
- README.md
- kseq.h
- seqtk.c


Changes:

=====================================
NEWS.md
=====================================
@@ -0,0 +1,26 @@
+Release 1.4-r122 (19 May 2023)
+------------------------------
+
+Notable changes:
+
+ * Improvement: faster FASTX parsing (#123)
+
+ * New feature: added the `telo` command to output telomere regions.
+
+ * New feature: added the `size` command to count the number of sequences and
+   the number of bases. Lighter and thus faster than `comp`.
+
+ * New feature: added the `hpc` command to compress homopolymers in input
+   sequences.
+
+ * New feature: added the `split` command to split a large input file into
+   multiple smaller files.
+
+ * New feature: added the `gap` command to output non-ACGT regions in the input
+   file.
+
+ * New feature: added option `-s` to command `subseq` to support the strand
+   field in BED. For the moment, this option does not work with other subseq
+   options.
+
+(1.4: 19 May 2023, r122)


=====================================
README.md
=====================================
@@ -59,3 +59,6 @@ Seqtk Examples
 
         seqtk trimfq -b 5 -e 10 in.fa > out.fa
 
+* Find telomere (TTAGGG)n repeats:
+
+        seqtk telo seq.fa > telo.bed 2> telo.count


=====================================
kseq.h
=====================================
@@ -108,8 +108,8 @@ typedef struct __kstring_t {
 				} else break; \
 			} \
 			if (delimiter == KS_SEP_LINE) { \
-				for (i = ks->begin; i < ks->end; ++i) \
-					if (ks->buf[i] == '\n') break; \
+				unsigned char *sep = (unsigned char*)memchr(ks->buf + ks->begin, '\n', ks->end - ks->begin); \
+				i = sep != NULL ? sep - ks->buf : ks->end; \
 			} else if (delimiter > KS_SEP_MAX) { \
 				for (i = ks->begin; i < ks->end; ++i) \
 					if (ks->buf[i] == delimiter) break; \


=====================================
seqtk.c
=====================================
@@ -1,6 +1,7 @@
 /* The MIT License
 
-   Copyright (c) 2008-2016 Broad Institute
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2008-2018 Broad Institute
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -41,6 +42,7 @@ KSEQ_INIT(gzFile, gzread)
 typedef struct {
 	int n, m;
 	uint64_t *a;
+	int8_t *rev;
 } reglist_t;
 
 #include "khash.h"
@@ -49,6 +51,65 @@ KHASH_SET_INIT_INT64(64)
 
 typedef kh_reg_t reghash_t;
 
+reghash_t *stk_reg_read_alt(const char *fn)
+{
+	reghash_t *h = kh_init(reg);
+	gzFile fp;
+	kstream_t *ks;
+	int dret;
+	kstring_t str = {0,0,0};
+
+	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) return 0;
+	ks = ks_init(fp);
+	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
+		int i, c, st = -1, en = -1, rev = 0;
+		char *p, *q;
+		reglist_t *r;
+		khint_t k;
+		for (i = 0, p = q = str.s;; ++p) {
+			if (*p == '\t' || *p == '\0') {
+				c = *p;
+				*p = 0;
+				if (i == 1) {
+					if (isdigit(*q)) st = strtol(q, &q, 10);
+					if (q != p) st = -1;
+				} else if (i == 2) {
+					if (isdigit(*q)) en = strtol(q, &q, 10);
+					if (q != p) en = -1;
+				} else if (i == 5) {
+					if (*q == '+') rev = 1;
+					else if (*q == '-') rev = -1;
+				}
+				++i, q = p + 1;
+				if (c == 0) break;
+			}
+		}
+		if (i == 0) continue;
+		k = kh_get(reg, h, str.s);
+		if (k == kh_end(h)) {
+			int ret;
+			char *s = strdup(str.s);
+			k = kh_put(reg, h, s, &ret);
+			memset(&kh_val(h, k), 0, sizeof(reglist_t));
+		}
+		r = &kh_val(h, k);
+		if (en < 0 && st > 0) en = st, st = st - 1; // if there is only one column
+		if (st < 0) st = 0, en = INT_MAX;
+		if (r->n == r->m) {
+			r->m = r->m? r->m<<1 : 4;
+			r->a = (uint64_t*)realloc(r->a, r->m * 8);
+			r->rev = (int8_t*)realloc(r->rev, r->m);
+		}
+		r->a[r->n] = (uint64_t)st<<32 | en;
+		r->rev[r->n++] = rev;
+	}
+	ks_destroy(ks);
+	gzclose(fp);
+	free(str.s);
+	return h;
+}
+
 reghash_t *stk_reg_read(const char *fn)
 {
 	reghash_t *h = kh_init(reg);
@@ -106,6 +167,7 @@ void stk_reg_destroy(reghash_t *h)
 	for (k = 0; k < kh_end(h); ++k) {
 		if (kh_exist(h, k)) {
 			free(kh_val(h, k).a);
+			free(kh_val(h, k).rev);
 			free((char*)kh_key(h, k));
 		}
 	}
@@ -168,42 +230,43 @@ char comp_tab[] = {
 	'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
 };
 
-static void stk_printstr(const kstring_t *s, unsigned line_len)
+static void stk_printstr(FILE *fp, const kstring_t *s, unsigned line_len)
 {
 	if (line_len != UINT_MAX && line_len != 0) {
 		int i, rest = s->l;
 		for (i = 0; i < s->l; i += line_len, rest -= line_len) {
-			putchar('\n');
-			if (rest > line_len) fwrite(s->s + i, 1, line_len, stdout);
-			else fwrite(s->s + i, 1, rest, stdout);
+			fputc('\n', fp);
+			if (rest > line_len) fwrite(s->s + i, 1, line_len, fp);
+			else fwrite(s->s + i, 1, rest, fp);
 		}
-		putchar('\n');
+		fputc('\n', fp);
 	} else {
-		putchar('\n');
-		puts(s->s);
+		fputc('\n', fp);
+		fputs(s->s, fp);
+		fputc('\n', fp);
 	}
 }
 
-static inline void stk_printseq_renamed(const kseq_t *s, int line_len, const char *prefix, int64_t n)
+static inline void stk_printseq_renamed(FILE *fp, const kseq_t *s, int line_len, const char *prefix, int64_t n)
 {
-	putchar(s->qual.l? '@' : '>');
+	fputc(s->qual.l? '@' : '>', fp);
 	if (n >= 0) {
-		if (prefix) fputs(prefix, stdout);
-		printf("%lld", (long long)n);
-	} else fputs(s->name.s, stdout);
+		if (prefix) fputs(prefix, fp);
+		fprintf(fp, "%lld", (long long)n);
+	} else fputs(s->name.s, fp);
 	if (s->comment.l) {
-		putchar(' '); fputs(s->comment.s, stdout);
+		fputc(' ', fp); fputs(s->comment.s, fp);
 	}
-	stk_printstr(&s->seq, line_len);
+	stk_printstr(fp, &s->seq, line_len);
 	if (s->qual.l) {
-		putchar('+');
-		stk_printstr(&s->qual, line_len);
+		fputc('+', fp);
+		stk_printstr(fp, &s->qual, line_len);
 	}
 }
 
-static inline void stk_printseq(const kseq_t *s, int line_len)
+static inline void stk_printseq(FILE *fp, const kseq_t *s, int line_len)
 {
-	stk_printseq_renamed(s, line_len, 0, -1);
+	stk_printseq_renamed(fp, s, line_len, 0, -1);
 }
 
 /* 
@@ -396,9 +459,9 @@ int stk_comp(int argc, char *argv[])
 		}
 		for (k = 0; p && k < p->n; ++k) {
 			int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
-			int la, lb, lc, na, nb, nc, cnt[11];
-			if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb];
-			else la = 'a', lb = -1, lc = 0;
+			int la, lb, na, nb, nc, cnt[11];
+			if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la];
+			else la = 'a', lb = -1;
 			na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
 			memset(cnt, 0, 11 * sizeof(int));
 			for (i = beg; i < end; ++i) {
@@ -422,7 +485,7 @@ int stk_comp(int argc, char *argv[])
 						if (b == 10 || b == 5) ++cnt[10];
 					}
 				}
-				la = a; lb = b; lc = c;
+				la = a; lb = b;
 			}
 			if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
 			else printf("%s\t%d", seq->name.s, l);
@@ -541,6 +604,42 @@ int stk_hety(int argc, char *argv[])
 	return 0;
 }
 
+int stk_gap(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int len, c, min_size = 50;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: seqtk gap [-l %d] <in.fa>\n", min_size);
+		return 1;
+	}
+	while ((c = getopt(argc, argv, "l:")) >= 0) {
+		switch (c) {
+		case 'l': min_size = atoi(optarg); break;
+		}
+	}
+	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		return 1;
+	}
+	seq = kseq_init(fp);
+	while ((len = kseq_read(seq)) >= 0) {
+		int i, l;
+		for (i = l = 0; i <= len; ++i) {
+			c = i < len? seq_nt6_table[(uint8_t)seq->seq.s[i]] : 5;
+			if (i == len || (c >= 1 && c <= 4)) {
+				if (l > 0 && l >= min_size)
+					printf("%s\t%d\t%d\n", seq->name.s, i - l, i);
+				l = 0;
+			} else ++l;
+		}
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
 /* subseq */
 
 int stk_subseq(int argc, char *argv[])
@@ -548,23 +647,28 @@ int stk_subseq(int argc, char *argv[])
 	khash_t(reg) *h = kh_init(reg);
 	gzFile fp;
 	kseq_t *seq;
-	int l, i, j, c, is_tab = 0, line = 0;
+	int l, i, j, c, is_tab = 0, line = 0, do_strand = 0;
+	char *seq_buf = 0;
+	uint32_t seq_max = 0;
 	khint_t k;
-	while ((c = getopt(argc, argv, "tl:")) >= 0) {
+	while ((c = getopt(argc, argv, "tl:s")) >= 0) {
 		switch (c) {
 		case 't': is_tab = 1; break;
+		case 's': do_strand = 1; break;
 		case 'l': line = atoi(optarg); break;
 		}
 	}
 	if (optind + 2 > argc) {
-		fprintf(stderr, "\n");
-		fprintf(stderr, "Usage:   seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n\n");
-		fprintf(stderr, "Options: -t       TAB delimited output\n");
-		fprintf(stderr, "         -l INT   sequence line length [%d]\n\n", line);
-		fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n");
+		fprintf(stderr, "Usage:   seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n");
+		fprintf(stderr, "Options:\n");
+		fprintf(stderr, "  -t       TAB delimited output\n");
+		fprintf(stderr, "  -s       strand aware\n");
+		fprintf(stderr, "  -l INT   sequence line length [%d]\n", line);
+		fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n");
 		return 1;
 	}
-	h = stk_reg_read(argv[optind+1]);
+	if (do_strand) h = stk_reg_read_alt(argv[optind+1]);
+	else h = stk_reg_read(argv[optind+1]);
 	if (h == 0) {
 		fprintf(stderr, "[E::%s] failed to read the list of regions in file '%s'\n", __func__, argv[optind+1]);
 		return 1;
@@ -598,21 +702,37 @@ int stk_subseq(int argc, char *argv[])
 				if (seq->comment.l) printf(" %s", seq->comment.s);
 			} else printf("%s\t%d\t", seq->name.s, beg + 1);
 			if (end > seq->seq.l) end = seq->seq.l;
-			for (j = 0; j < end - beg; ++j) {
-				if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n');
-				putchar(seq->seq.s[j + beg]);
-			}
-			putchar('\n');
-			if (seq->qual.l != seq->seq.l || is_tab) continue;
-			printf("+");
-			for (j = 0; j < end - beg; ++j) {
-				if (j == 0 || (line > 0 && j % line == 0)) putchar('\n');
-				putchar(seq->qual.s[j + beg]);
+			if (do_strand && p->rev) { // TODO: the strand mode only works with FASTA
+				if (end - beg >= seq_max) {
+					seq_max = end - beg;
+					kroundup32(seq_max);
+					seq_buf = (char*)realloc(seq_buf, seq_max);
+				}
+				if (p->rev[i] < 0)
+					for (j = end - 1; j >= beg; --j)
+						seq_buf[end - 1 - j] = (uint8_t)seq->seq.s[j] >= 128? 'N' : comp_tab[(uint8_t)seq->seq.s[j]];
+				else memcpy(seq_buf, &seq->seq.s[beg], end - beg);
+				putchar('\n');
+				fwrite(seq_buf, 1, end - beg, stdout);
+				putchar('\n');
+			} else {
+				for (j = 0; j < end - beg; ++j) {
+					if (is_tab == 0 && (j == 0 || (line > 0 && j % line == 0))) putchar('\n');
+					putchar(seq->seq.s[j + beg]);
+				}
+				putchar('\n');
+				if (seq->qual.l != seq->seq.l || is_tab) continue;
+				printf("+");
+				for (j = 0; j < end - beg; ++j) {
+					if (j == 0 || (line > 0 && j % line == 0)) putchar('\n');
+					putchar(seq->qual.s[j + beg]);
+				}
+				putchar('\n');
 			}
-			putchar('\n');
 		}
 	}
 	// free
+	free(seq_buf);
 	kseq_destroy(seq);
 	gzclose(fp);
 	stk_reg_destroy(h);
@@ -876,6 +996,54 @@ int stk_listhet(int argc, char *argv[])
 	return 0;
 }
 
+int stk_split(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int c, i, l, n = 10, len = 0;
+	char *prefix, *fn;
+	FILE **out;
+	while ((c = getopt(argc, argv, "n:l:")) >= 0) {
+		if (c == 'n') n = atoi(optarg);
+		else if (c == 'l') len = atoi(optarg);
+	}
+	if (argc == optind) {
+		fprintf(stderr, "Usage: seqtk split [options] <prefix> <in.fa>\n");
+		fprintf(stderr, "Options:\n");
+		fprintf(stderr, "  -n INT    number of files [%d]\n", n);
+		fprintf(stderr, "  -l INT    line length [%d]\n", len);
+		return 1;
+	}
+	prefix = argv[optind];
+	fp = (strcmp(argv[optind+1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind+1], "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		return 1;
+	}
+	out = (FILE**)malloc(sizeof(FILE*) * n);
+	fn = (char*)malloc(strlen(prefix) + 10);
+	for (i = 0; i < n; ++i) {
+		sprintf(fn, "%s.%.5d.fa", prefix, i + 1);
+		out[i] = fopen(fn, "w+");
+		if (out[i] == 0) {
+			fprintf(stderr, "ERROR: failed to create file %s\n", fn);
+			exit(1);
+		}
+	}
+	free(fn);
+	seq = kseq_init(fp);
+	i = 0;
+	while ((l = kseq_read(seq)) >= 0) {
+		stk_printseq(out[i % n], seq, len);
+		++i;
+	}
+	for (i = 0; i < n; ++i) fclose(out[i]);
+	free(out);
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
+
 /* cutN */
 static int cutN_min_N_tract = 1000;
 static int cutN_nonN_penalty = 10;
@@ -1072,11 +1240,11 @@ int stk_sample(int argc, char *argv[])
 			if (num) {
 				uint64_t y = n_seqs - 1 < num? n_seqs - 1 : (uint64_t)(r * n_seqs);
 				if (y < num) cpy_kseq(&buf[y], seq);
-			} else if (r < frac) stk_printseq(seq, UINT_MAX);
+			} else if (r < frac) stk_printseq(stdout, seq, UINT_MAX);
 		}
 		for (i = 0; i < num; ++i) {
 			kseq_t *p = &buf[i];
-			if (p->seq.l) stk_printseq(p, UINT_MAX);
+			if (p->seq.l) stk_printseq(stdout, p, UINT_MAX);
 			free(p->seq.s); free(p->qual.s); free(p->name.s);
 		}
 		if (buf != NULL) free(buf);
@@ -1119,7 +1287,7 @@ int stk_sample(int argc, char *argv[])
 		n_seqs = 0;
 		while (kseq_read(seq) >= 0)
 			if (kh_get(64, hash, ++n_seqs) != kh_end(hash))
-				stk_printseq(seq, UINT_MAX);
+				stk_printseq(stdout, seq, UINT_MAX);
 		kh_destroy(64, hash);
 	}
 
@@ -1186,7 +1354,7 @@ int stk_seq(int argc, char *argv[])
 	khash_t(reg) *h = 0;
 	krand_t *kr = 0;
 
-	while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:SF:")) >= 0) {
+	while ((c = getopt(argc, argv, "N12q:l:Q:aACrn:s:f:M:L:cVUX:SF:x")) >= 0) {
 		switch (c) {
 			case 'a':
 			case 'A': flag |= 1; break;
@@ -1199,6 +1367,7 @@ int stk_seq(int argc, char *argv[])
 			case 'N': flag |= 128; break;
 			case 'U': flag |= 256; break;
 			case 'S': flag |= 512; break;
+			case 'x': flag |= 1024; break;
 			case 'M': h = stk_reg_read(optarg); break;
 			case 'n': mask_chr = *optarg; break;
 			case 'Q': qual_shift = atoi(optarg); break;
@@ -1234,6 +1403,7 @@ int stk_seq(int argc, char *argv[])
 		fprintf(stderr, "         -2        output the 2n reads only\n");
 		fprintf(stderr, "         -V        shift quality by '(-Q) - 33'\n");
 		fprintf(stderr, "         -U        convert all bases to uppercases\n");
+		fprintf(stderr, "         -x        convert all lowercases to -n\n");
 		fprintf(stderr, "         -S        strip of white spaces in sequences\n");
 		fprintf(stderr, "\n");
 		free(kr);
@@ -1282,6 +1452,9 @@ int stk_seq(int argc, char *argv[])
 		if (flag & 256) // option -U: convert to uppercases
 			for (i = 0; i < seq->seq.l; ++i)
 				seq->seq.s[i] = toupper(seq->seq.s[i]);
+		else if ((flag & 1024) && mask_chr > 0)
+			for (i = 0; i < seq->seq.l; ++i)
+				seq->seq.s[i] = islower(seq->seq.s[i])? mask_chr : seq->seq.s[i];
 		if (flag & 1) seq->qual.l = 0; // option -a: fastq -> fasta
 		else if (fake_qual >= 33 && fake_qual <= 127) {
 			if (seq->qual.m < seq->seq.m) {
@@ -1317,7 +1490,7 @@ int stk_seq(int argc, char *argv[])
 				if (seq_nt16to4_table[seq_nt16_table[(int)seq->seq.s[i]]] > 3) break;
 			if (i < seq->seq.l) continue;
 		}
-		stk_printseq(seq, line_len);
+		stk_printseq(stdout, seq, line_len);
 	}
 	kseq_destroy(seq);
 	gzclose(fp);
@@ -1409,8 +1582,8 @@ int stk_mergepe(int argc, char *argv[])
 			fprintf(stderr, "[W::%s] the 2nd file has fewer records.\n", __func__);
 			break;
 		}
-		stk_printseq(seq[0], 0);
-		stk_printseq(seq[1], 0);
+		stk_printseq(stdout, seq[0], 0);
+		stk_printseq(stdout, seq[1], 0);
 	}
 	if (kseq_read(seq[1]) >= 0)
 		fprintf(stderr, "[W::%s] the 1st file has fewer records.\n", __func__);
@@ -1445,8 +1618,8 @@ int stk_dropse(int argc, char *argv[])
 				is_diff = strncmp(p->s, q->s, l);
 			} else is_diff = 1;
 			if (!is_diff) {
-				stk_printseq(&last, 0);
-				stk_printseq(seq, 0);
+				stk_printseq(stdout, &last, 0);
+				stk_printseq(stdout, seq, 0);
 				last.name.l = 0;
 			} else cpy_kseq(&last, seq);
 		} else cpy_kseq(&last, seq);
@@ -1458,6 +1631,59 @@ int stk_dropse(int argc, char *argv[])
 	return 0;
 }
 
+static inline int kputc(int c, kstring_t *s)
+{
+	if (s->l + 1 >= s->m) {
+		char *tmp;
+		s->m = s->l + 2;
+		kroundup32(s->m);
+		if ((tmp = (char*)realloc(s->s, s->m)))
+			s->s = tmp;
+		else
+			return EOF;
+	}
+	s->s[s->l++] = c;
+	s->s[s->l] = 0;
+	return c;
+}
+
+int stk_hpc(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	kstring_t str = {0,0,0};
+
+	if (argc == 1 && isatty(fileno(stdin))) {
+		fprintf(stderr, "Usage: seqtk hpc <in.fq>\n");
+		return 1;
+	}
+	fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		return 1;
+	}
+	seq = kseq_init(fp);
+
+	while (kseq_read(seq) >= 0) {
+		int i, last;
+		str.l = 0;
+		if (seq->seq.l == 0) continue;
+		for (i = 1, last = 0; i <= seq->seq.l; ++i) {
+			if (i == seq->seq.l || seq->seq.s[last] != seq->seq.s[i]) {
+				kputc(seq->seq.s[last], &str);
+				last = i;
+			}
+		}
+		putchar('>'); puts(seq->name.s);
+		puts(str.s);
+	}
+
+	kseq_destroy(seq);
+	gzclose(fp);
+	free(str.s);
+	return 0;
+}
+
 int stk_rename(int argc, char *argv[])
 {
 	gzFile fp;
@@ -1487,18 +1713,18 @@ int stk_rename(int argc, char *argv[])
 				is_diff = strncmp(p->s, q->s, l);
 			} else is_diff = 1;
 			if (!is_diff) {
-				stk_printseq_renamed(&last, 0, prefix, n);
-				stk_printseq_renamed(seq,   0, prefix, n);
+				stk_printseq_renamed(stdout, &last, 0, prefix, n);
+				stk_printseq_renamed(stdout, seq,   0, prefix, n);
 				last.name.l = 0;
 				++n;
 			} else {
-				stk_printseq_renamed(&last, 0, prefix, n);
+				stk_printseq_renamed(stdout, &last, 0, prefix, n);
 				++n;
 				cpy_kseq(&last, seq);
 			}
 		} else cpy_kseq(&last, seq);
 	}
-	if (last.name.l) stk_printseq_renamed(&last, 0, prefix, n);
+	if (last.name.l) stk_printseq_renamed(stdout, &last, 0, prefix, n);
 
 	kseq_destroy(seq);
 	gzclose(fp);
@@ -1670,18 +1896,136 @@ int stk_fqchk(int argc, char *argv[])
 	return 0;
 }
 
+int stk_size(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	uint64_t n = 0, l = 0;
+	if (argc == 1 && isatty(fileno(stdin))) {
+		fprintf(stderr, "Usage: seqtk size <in.fq>\n");
+		return 1;
+	}
+	fp = argc > 1 && strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		return 1;
+	}
+	seq = kseq_init(fp);
+	while (kseq_read(seq) >= 0) {
+		++n;
+		l += seq->seq.l;
+	}
+	kseq_destroy(seq);
+	gzclose(fp);
+	printf("%lld\t%lld\n", (long long)n, (long long)l);
+	return 0;
+}
+
+int stk_telo(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	char *telo_seq = "CCCTAA";
+	int c, i, j, len, absent, penalty = 1, max_drop = 2000, min_score = 300;
+	uint64_t x, mask, sum_input = 0, sum_telo = 0;
+	khash_t(64) *h;
+
+	while ((c = getopt(argc, argv, "m:p:d:s:")) >= 0) {
+		if (c == 'm') telo_seq = optarg;
+		else if (c == 'p') penalty = atoi(optarg);
+		else if (c == 'd') max_drop = atoi(optarg);
+		else if (c == 's') min_score = atoi(optarg);
+	}
+	if (penalty < 0) penalty = -penalty;
+	if (argc == optind && isatty(fileno(stdin))) {
+		fprintf(stderr, "Usage: seqtk telo [options] <in.fq>\n");
+		fprintf(stderr, "Options:\n");
+		fprintf(stderr, "  -m STR     motif [%s]\n", telo_seq);
+		fprintf(stderr, "  -p INT     penalty [%d]\n", penalty);
+		fprintf(stderr, "  -d INT     max drop [%d]\n", max_drop);
+		fprintf(stderr, "  -s INT     min score [%d]\n", min_score);
+		return 1;
+	}
+
+	len = strlen(telo_seq);
+	mask = (1ULL<<2*len) - 1;
+
+	h = kh_init(64); // hash table for all roations of the telomere motif
+	kh_resize(64, h, len * 2);
+	for (i = 0; i < len; ++i) {
+		for (j = 0, x = 0; j < len; ++j) {
+			int c = seq_nt6_table[(uint8_t)telo_seq[(i + j) % len]];
+			assert(c >= 1 && c <= 4);
+			x = x<<2 | (c-1);
+		}
+		kh_put(64, h, x, &absent);
+	}
+
+	fp = argc > 1 && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
+		return 1;
+	}
+	seq = kseq_init(fp);
+	while (kseq_read(seq) >= 0) {
+		ssize_t i, l, max_i = -1, st = 0;
+		int64_t score, max;
+		sum_input += seq->seq.l;
+		score = max = 0, max_i = -1;
+		for (i = 0, l = 0, x = 0; i < seq->seq.l; ++i) { // 5'-end, check CCCTAA
+			int hit = 0, c = seq_nt6_table[(uint8_t)seq->seq.s[i]];
+			if (c >= 1 && c <= 4) { // not N
+				x = (x<<2 | (c-1)) & mask;
+				if (++l >= len && kh_get(64, h, x) != kh_end(h)) // x is at least 6bp long and is a telomere motif
+					hit = 1;
+			} else l = 0, x = 0; // N, ambiguous base
+			if (i >= len) score += hit? 1 : -penalty;
+			if (score > max) max = score, max_i = i;
+			else if (max - score > max_drop) break;
+		}
+		if (max >= min_score) {
+			printf("%s\t0\t%ld\t%ld\n", seq->name.s, max_i + 1, seq->seq.l);
+			sum_telo += max_i + 1;
+			st = max_i + 1;
+		}
+		score = max = 0, max_i = -1;
+		for (i = seq->seq.l - 1, l = 0, x = 0; i >= st; --i) { // 3'-end; similar to the for loop above
+			int hit = 0, c = seq_nt6_table[(uint8_t)seq->seq.s[i]];
+			if (c >= 1 && c <= 4) {
+				x = (x<<2 | (4-c)) & mask;
+				if (++l >= len && kh_get(64, h, x) != kh_end(h))
+					hit = 1;
+			} else l = 0, x = 0;
+			if (seq->seq.l - i >= len) score += hit? 1 : -penalty;
+			if (score > max) max = score, max_i = i;
+			else if (max - score > max_drop) break;
+		}
+		if (max >= min_score) {
+			printf("%s\t%ld\t%ld\t%ld\n", seq->name.s, max_i, seq->seq.l, seq->seq.l);
+			sum_telo += seq->seq.l - max_i;
+		}
+	}
+	kh_destroy(64, h);
+	kseq_destroy(seq);
+	gzclose(fp);
+	fprintf(stderr, "%ld\t%ld\n", (long)sum_telo, (long)sum_input);
+	return 0;
+}
+
 /* main function */
 static int usage()
 {
 	fprintf(stderr, "\n");
 	fprintf(stderr, "Usage:   seqtk <command> <arguments>\n");
-	fprintf(stderr, "Version: 1.3-r106\n\n");
+	fprintf(stderr, "Version: 1.4-r122\n\n");
 	fprintf(stderr, "Command: seq       common transformation of FASTA/Q\n");
+	fprintf(stderr, "         size      report the number sequences and bases\n");
 	fprintf(stderr, "         comp      get the nucleotide composition of FASTA/Q\n");
 	fprintf(stderr, "         sample    subsample sequences\n");
 	fprintf(stderr, "         subseq    extract subsequences from FASTA/Q\n");
 	fprintf(stderr, "         fqchk     fastq QC (base/quality summary)\n");
 	fprintf(stderr, "         mergepe   interleave two PE FASTA/Q files\n");
+	fprintf(stderr, "         split     split one file into multiple smaller files\n");
 	fprintf(stderr, "         trimfq    trim FASTQ using the Phred algorithm\n\n");
 	fprintf(stderr, "         hety      regional heterozygosity\n");
 	fprintf(stderr, "         gc        identify high- or low-GC regions\n");
@@ -1692,7 +2036,10 @@ static int usage()
 	fprintf(stderr, "         rename    rename sequence names\n");
 	fprintf(stderr, "         randbase  choose a random base from hets\n");
 	fprintf(stderr, "         cutN      cut sequence at long N\n");
+	fprintf(stderr, "         gap       get the gap locations\n");
 	fprintf(stderr, "         listhet   extract the position of each het\n");
+	fprintf(stderr, "         hpc       homopolyer-compressed sequence\n");
+	fprintf(stderr, "         telo      identify telomere repeats in asm or long reads\n");
 	fprintf(stderr, "\n");
 	return 1;
 }
@@ -1711,6 +2058,7 @@ int main(int argc, char *argv[])
 	else if (strcmp(argv[1], "dropse") == 0) return stk_dropse(argc-1, argv+1);
 	else if (strcmp(argv[1], "randbase") == 0) return stk_randbase(argc-1, argv+1);
 	else if (strcmp(argv[1], "cutN") == 0) return stk_cutN(argc-1, argv+1);
+	else if (strcmp(argv[1], "gap") == 0) return stk_gap(argc-1, argv+1);
 	else if (strcmp(argv[1], "listhet") == 0) return stk_listhet(argc-1, argv+1);
 	else if (strcmp(argv[1], "famask") == 0) return stk_famask(argc-1, argv+1);
 	else if (strcmp(argv[1], "trimfq") == 0) return stk_trimfq(argc-1, argv+1);
@@ -1719,6 +2067,10 @@ int main(int argc, char *argv[])
 	else if (strcmp(argv[1], "seq") == 0) return stk_seq(argc-1, argv+1);
 	else if (strcmp(argv[1], "kfreq") == 0) return stk_kfreq(argc-1, argv+1);
 	else if (strcmp(argv[1], "rename") == 0) return stk_rename(argc-1, argv+1);
+	else if (strcmp(argv[1], "split") == 0) return stk_split(argc-1, argv+1);
+	else if (strcmp(argv[1], "hpc") == 0) return stk_hpc(argc-1, argv+1);
+	else if (strcmp(argv[1], "size") == 0) return stk_size(argc-1, argv+1);
+	else if (strcmp(argv[1], "telo") == 0) return stk_telo(argc-1, argv+1);
 	else {
 		fprintf(stderr, "[main] unrecognized command '%s'. Abort!\n", argv[1]);
 		return 1;



View it on GitLab: https://salsa.debian.org/med-team/seqtk/-/commit/52284c3c8ad9e69e87381cc8e2e897e796aec55c

-- 
View it on GitLab: https://salsa.debian.org/med-team/seqtk/-/commit/52284c3c8ad9e69e87381cc8e2e897e796aec55c
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231109/b06c0ac7/attachment-0001.htm>


More information about the debian-med-commit mailing list