[med-svn] [Git][med-team/ncbi-entrez-direct][master] 5 commits: New upstream version 14.6.20210209+dfsg

Aaron M. Ucko gitlab at salsa.debian.org
Fri Feb 12 22:45:37 GMT 2021



Aaron M. Ucko pushed to branch master at Debian Med / ncbi-entrez-direct


Commits:
bfa1bdb4 by Aaron M. Ucko at 2021-02-12T12:06:11-05:00
New upstream version 14.6.20210209+dfsg
- - - - -
c8043a71 by Aaron M. Ucko at 2021-02-12T12:34:48-05:00
Merge tag 'upstream/14.6.20210209+dfsg'

Upstream version 14.6.20210209(+dfsg).

- - - - -
6e85df44 by Aaron M. Ucko at 2021-02-12T12:35:10-05:00
debian/man/xtract.1: Tune hyphenation under -insd.

- - - - -
30083842 by Aaron M. Ucko at 2021-02-12T12:35:22-05:00
debian/man/*.1: Update for new release (14.6.20210209[+dfsg]).

* transmute.1: New -extract option (Sequence Editing).
* xtract.1: New -insd qualifier type feat_location.

- - - - -
baad3cc2 by Aaron M. Ucko at 2021-02-12T12:35:33-05:00
Finalize ncbi-entrez-direct 14.6.20210209+dfsg-1 for unstable.

These changes are small enough for the soft freeze, and could have
beaten the freeze if I hadn't waited for the previous upload to migrate.
(golang-1.15 formally blocked it.)

- - - - -


5 changed files:

- debian/changelog
- debian/man/transmute.1
- debian/man/xtract.1
- transmute.go
- xtract.go


Changes:

=====================================
debian/changelog
=====================================
@@ -1,3 +1,11 @@
+ncbi-entrez-direct (14.6.20210209+dfsg-1) unstable; urgency=medium
+
+  * New upstream release.
+  * debian/man/{transmute,xtract}.1: Update for new release.
+  * debian/man/xtract.1: Tune hyphenation under -insd.
+
+ -- Aaron M. Ucko <ucko at debian.org>  Fri, 12 Feb 2021 12:31:49 -0500
+
 ncbi-entrez-direct (14.6.20210203+dfsg-2) unstable; urgency=medium
 
   * Standards-Version: 4.5.1 (routine-update)


=====================================
debian/man/transmute.1
=====================================
@@ -1,4 +1,4 @@
-.TH TRANSMUTE 1 2021-02-06 NCBI "NCBI Entrez Direct User's Manual"
+.TH TRANSMUTE 1 2021-02-12 NCBI "NCBI Entrez Direct User's Manual"
 .SH NAME
 align\-columns, gbf2xml, transmute \- transform (NCBI Entrez Direct) data
 .SH SYNOPSIS
@@ -45,6 +45,8 @@ align\-columns, gbf2xml, transmute \- transform (NCBI Entrez Direct) data
 [\|\fB\-delete\fP\ \fIN\fP\|]
 [\|\fB\-insert\fP\ \fIseq\fP\|]
 
+\fBtransmute\fP \fB\-extract\fP\ \fIfeat_loc\fP
+
 \fBtransmute\fP \fB\-cds2prot\fP
 [\|\fB\-code\fP\ \fIN\fP\|]
 [\|\fB\-frame\fP\ \fIN\fP\|]
@@ -237,6 +239,9 @@ Delete \fIN\fP bases or residues.
 Insert given sequence.
 .RE
 .PD
+.TP 10
+\fB\-extract\fP\ \fIfeat_loc\fP
+Use \fBxtract \-insd\fP ... \fBfeat_location\fP instructions.
 .SS Sequence Processing
 .TP 10
 \fB\-cds2prot\fP


=====================================
debian/man/xtract.1
=====================================
@@ -1,4 +1,4 @@
-.TH XTRACT 1 2021-02-06 NCBI "NCBI Entrez Direct User's Manual"
+.TH XTRACT 1 2021-02-12 NCBI "NCBI Entrez Direct User's Manual"
 .SH NAME
 xtract \- NCBI Entrez Direct XML conversion and transformation tool
 .SH SYNOPSIS
@@ -541,18 +541,20 @@ Print them if invoked standalone;
 run them if invoked as part of a pipeline.
 Requires one or more arguments,
 which may appear in the following order:
+.nh
 .RS
 .\".PD 0
 .IP Descriptor(s) 15
-.BR INSDSeq_sequence / INSDSeq_definition / INSDSeq_division "/... [\|...\|]"
+.BR INSDSeq_sequence / INSDSeq_definition /\: INSDSeq_division "/... [\|...\|]"
 .IP Completeness 15
 .BR complete / partial
 .IP Feature(s) 15
 .BR CDS / mRNA /...[\| , ...\|]
 .IP Qualifier(s)
-.BR INSDFeature_key / \(dq#INSDInterval\(dq / gene / product / sub_sequence "/... [\|...\|]"
+.BR INSDFeature_key / \(dq#INSDInterval\(dq / gene / product /\: feat_location / sub_sequence "/... [\|...\|]"
 .\".PD
 .RE
+.hy 1
 .SS Frequency Table
 .TP
 \fB\-histogram\fP


=====================================
transmute.go
=====================================
@@ -155,6 +155,8 @@ Sequence Editing
     -delete      Delete N bases
     -insert      Insert given sequence
 
+  -extract     Use xtract -insd feat_location instructions
+
 Sequence Processing
 
   -cds2prot    Translate coding region into protein
@@ -274,14 +276,14 @@ Mitochondrial Mistranslation
 
   efetch -db nuccore -id NC_012920 -format gb |
   transmute -g2x |
-  xtract -insd CDS gene product translation sub_sequence |
-  while IFS=$'\t' read acc gene prod prot seq
+  xtract -insd CDS gene product protein_id translation sub_sequence |
+  while IFS=$'\t' read acc gene prod prid prot seq
   do
     mito=$( echo "$seq" | transmute -cds2prot -code 2 -stop )
     norm=$( echo "$seq" | transmute -cds2prot -code 1 -stop )
     if [ "$mito" != "$norm" ]
     then
-      echo ">$acc $gene $prod"
+      echo ">$acc $gene $prid $prod"
       transmute -diff <( echo "$mito" ) <( echo "$norm" )
       echo ""
     fi
@@ -4639,8 +4641,8 @@ func SequenceRemove(inp io.Reader, args []string) {
 		return
 	}
 
-	first := 0
-	last := 0
+	first := ""
+	last := ""
 
 	// skip past command name
 	args = args[1:]
@@ -4649,10 +4651,12 @@ func SequenceRemove(inp io.Reader, args []string) {
 
 		switch args[0] {
 		case "-first":
-			first = GetNumericArg(args, "Bases to delete at beginning", 0, -1, -1)
+			first = GetStringArg(args, "Bases to delete at beginning")
+			first = strings.ToUpper(first)
 			args = args[2:]
 		case "-last":
-			last = GetNumericArg(args, "Bases to delete at end", 0, -1, -1)
+			last = GetStringArg(args, "Bases to delete at end")
+			last = strings.ToUpper(last)
 			args = args[2:]
 		default:
 			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized option after -remove command\n")
@@ -4663,21 +4667,63 @@ func SequenceRemove(inp io.Reader, args []string) {
 	str := ReadAllIntoSequence(inp)
 
 	ln := len(str)
-	if first > 0 {
-		if first <= ln {
-			str = str[first:]
-			ln = len(str)
-		} else {
-			fmt.Fprintf(os.Stderr, "\nERROR: -first argument %d is greater than sequence length %d\n", first, ln)
-			str = ""
+
+	if IsAllDigits(first) {
+		val, err := strconv.Atoi(first)
+		if err == nil && val > 0 {
+			if val <= ln {
+				str = str[val:]
+				ln = len(str)
+			} else {
+				fmt.Fprintf(os.Stderr, "\nERROR: -first argument %d is greater than sequence length %d\n", val, ln)
+				str = ""
+			}
+		}
+	} else {
+		val := len(first)
+		if val > 0 {
+			if val <= ln {
+				// warn if existing sequence does not match deletion argument
+				ext := str[:val]
+				if first != ext {
+					fmt.Fprintf(os.Stderr, "\nWARNING: -first argument %s does not match existing sequence %s\n", first, ext)
+				}
+				// delete characters
+				str = str[val:]
+				ln = len(str)
+			} else {
+				fmt.Fprintf(os.Stderr, "\nERROR: -first argument %d is greater than sequence length %d\n", val, ln)
+				str = ""
+			}
 		}
 	}
-	if last > 0 {
-		if last <= ln {
-			str = str[:ln-last]
-		} else {
-			fmt.Fprintf(os.Stderr, "\nERROR: -last argument %d is greater than sequence length %d\n", last, ln)
-			str = ""
+
+	if IsAllDigits(last) {
+		val, err := strconv.Atoi(last)
+		if err == nil && val > 0 {
+			if val <= ln {
+				str = str[:ln-val]
+			} else {
+				fmt.Fprintf(os.Stderr, "\nERROR: -last argument %d is greater than remaining sequence length %d\n", val, ln)
+				str = ""
+			}
+		}
+	} else {
+		val := len(last)
+		if val > 0 {
+			if val <= ln {
+				// warn if existing sequence does not match deletion argument
+				ext := str[ln-val:]
+				if last != ext {
+					fmt.Fprintf(os.Stderr, "\nWARNING: -last argument %s does not match existing sequence %s\n", last, ext)
+				}
+				// delete characters
+				str = str[:ln-val]
+				ln = len(str)
+			} else {
+				fmt.Fprintf(os.Stderr, "\nERROR: -last argument %d is greater than sequence length %d\n", val, ln)
+				str = ""
+			}
 		}
 	}
 
@@ -4846,6 +4892,77 @@ func SequenceReplace(inp io.Reader, args []string) {
 	}
 }
 
+func SequenceExtract(inp io.Reader, args []string) {
+
+	if inp == nil {
+		return
+	}
+
+	// skip past command name
+	args = args[1:]
+
+	if len(args) < 1 {
+		fmt.Fprintf(os.Stderr, "\nERROR: Missing argument after -extract command\n")
+		os.Exit(1)
+	}
+
+	// read output of xtract -insd feat_location qualifier
+	feat_loc := args[0]
+
+	str := ReadAllIntoSequence(inp)
+
+	ln := len(str)
+
+	// split intervals, e.g., "201..224,1550..1920,1986..2085,2317..2404,2466..2629"
+	comma := strings.Split(feat_loc, ",")
+
+	for _, item := range comma {
+
+		// also allow dash separator, e.g., "201-224,1550-1920,1986-2085,2317-2404,2466-2629"
+		item = strings.Replace(item, "-", "..", -1)
+
+		fr, to := SplitInTwoAt(item, "..", LEFT)
+
+		fr = strings.TrimSpace(fr)
+		to = strings.TrimSpace(to)
+
+		min, err := strconv.Atoi(fr)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized number '%s'\n", fr)
+			os.Exit(1)
+		}
+		if min < 1 || min > ln {
+			fmt.Fprintf(os.Stderr, "\nERROR: Starting point '%s' out of range\n", fr)
+			os.Exit(1)
+		}
+
+		max, err := strconv.Atoi(to)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "\nERROR: Unrecognized number '%s'\n", to)
+			os.Exit(1)
+		}
+		if max < 1 || max > ln {
+			fmt.Fprintf(os.Stderr, "\nERROR: Ending point '%s' out of range\n", to)
+			os.Exit(1)
+		}
+
+		if min < max {
+			min--
+			sub := str[min:max]
+			os.Stdout.WriteString(sub)
+		} else if min > max {
+			max--
+			sub := str[max:min]
+			sub = ReverseComplement(sub)
+			os.Stdout.WriteString(sub)
+		} else {
+			// need more information to know strand if single point
+		}
+	}
+
+	os.Stdout.WriteString("\n")
+}
+
 // REVERSE SEQUENCE
 
 // SeqFlip reverses without complementing - e.g., minus strand proteins translated in reverse order
@@ -4958,7 +5075,7 @@ func PrintFastaPairs(frst, scnd string) {
 	// print in blocks of 50 bases or residues
 	for i := 0; i < mx; i += 50 {
 		dl := 50
-		if mx -i < 50 {
+		if mx-i < 50 {
 			dl = mx - i
 		}
 		lf := fs[:dl]
@@ -8344,6 +8461,8 @@ func main() {
 		SequenceRetain(in, args)
 	case "-replace":
 		SequenceReplace(in, args)
+	case "-extract":
+		SequenceExtract(in, args)
 	case "-revcomp":
 		NucRevComp(in)
 	case "-reverse":


=====================================
xtract.go
=====================================
@@ -276,7 +276,7 @@ Command Generator
   Descriptors      INSDSeq_sequence INSDSeq_definition INSDSeq_division
   Flags            [complete|partial]
   Feature(s)       CDS,mRNA
-  Qualifiers       INSDFeature_key "#INSDInterval" gene product sub_sequence
+  Qualifiers       INSDFeature_key "#INSDInterval" gene product feat_location sub_sequence
 
 Variation Processing
 
@@ -6340,6 +6340,66 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 				acc = append(acc, "-deq", "\"\\t\"")
 			}
 
+		} else if str == "feat_location" {
+
+			// special feat_location qualifier shows feature intervals
+			acc = append(acc, "-block", "INSDFeature_intervals")
+
+			acc = append(acc, "-subset", "INSDInterval", "-FR", "INSDInterval_from", "-TO", "INSDInterval_to")
+			if isPipe {
+				acc = append(acc, "-pfx", "", "-tab", "..", "-element", "&FR")
+				acc = append(acc, "-pfx", "", "-tab", ",", "-element", "&TO")
+			} else {
+				acc = append(acc, "-pfx", "\"\"", "-tab", "\"..\"", "-element", "\"&FR\"")
+				acc = append(acc, "-pfx", "\"\"", "-tab", "\",\"", "-element", "\"&TO\"")
+			}
+
+			acc = append(acc, "-subset", "INSDFeature_intervals")
+			if isPipe {
+				acc = append(acc, "-deq", "\\t")
+			} else {
+				acc = append(acc, "-deq", "\"\\t\"")
+			}
+
+
+		} else if str == "chloroplast" ||
+			str == "chromoplast" ||
+			str == "cyanelle" ||
+			str == "environmental_sample" ||
+			str == "focus" ||
+			str == "germline" ||
+			str == "kinetoplast" ||
+			str == "macronuclear" ||
+			str == "metagenomic" ||
+			str == "mitochondrion" ||
+			str == "partial" ||
+			str == "proviral" ||
+			str == "pseudo" ||
+			str == "rearranged" ||
+			str == "ribosomal_slippage" ||
+			str == "trans_splicing" ||
+			str == "transgenic" ||
+			str == "virion" {
+
+			acc = append(acc, "-block", "INSDQualifier")
+
+			checkAgainstVocabulary(str, "qualifier", qualifiers)
+			if doIndex {
+				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
+				acc = append(acc, "-clr", "-indices", "INSDQualifier_name")
+			} else {
+				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
+				acc = append(acc, "-lbl", str)
+			}
+			if addDash {
+				acc = append(acc, "-block", "INSDFeature", "-unless", "INSDQualifier_name", "-equals", str)
+				if isPipe {
+					acc = append(acc, "-lbl", "\\-")
+				} else {
+					acc = append(acc, "-lbl", "\"\\-\"")
+				}
+			}
+
 		} else {
 
 			acc = append(acc, "-block", "INSDQualifier")



View it on GitLab: https://salsa.debian.org/med-team/ncbi-entrez-direct/-/compare/fe1e6da39bf57ea0e3c70fcb07425e77995ca363...baad3cc2c8cb9ac3914b3f86b251d204ec06c551

-- 
View it on GitLab: https://salsa.debian.org/med-team/ncbi-entrez-direct/-/compare/fe1e6da39bf57ea0e3c70fcb07425e77995ca363...baad3cc2c8cb9ac3914b3f86b251d204ec06c551
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210212/91c5eb67/attachment-0001.html>


More information about the debian-med-commit mailing list