[med-svn] [ncbi-entrez-direct] 01/08: New upstream version 6.90.20170705+ds

Aaron M. Ucko ucko at moszumanska.debian.org
Thu Jul 6 02:16:17 UTC 2017


This is an automated email from the git hooks/post-receive script.

ucko pushed a commit to branch master
in repository ncbi-entrez-direct.

commit 1b48ede6b93841101bf7b93b8e47307d890fbe8d
Author: Aaron M. Ucko <ucko at debian.org>
Date:   Wed Jul 5 21:31:52 2017 -0400

    New upstream version 6.90.20170705+ds
---
 edirect.pl |  28 +++-
 nquire     |   3 +
 xtract.go  | 422 +++++++++++++++++++++++++++++++++++++++++++------------------
 3 files changed, 324 insertions(+), 129 deletions(-)

diff --git a/edirect.pl b/edirect.pl
index 7c59db4..d888659 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -87,7 +87,7 @@ use constant true  => 1;
 
 # EDirect version number
 
-$version = "6.80";
+$version = "6.90";
 
 # URL address components
 
@@ -2140,7 +2140,7 @@ sub eftch {
     }
 
     # use larger chunk for UID format
-    $chunk = 1000;
+    $chunk = 5000;
     for ( $start = $min; $start < $max; $start += $chunk ) {
 
       my @ids = get_uids ( $dbase, $web, $key, $start, $chunk, $max, $tool, $email );
@@ -2184,7 +2184,7 @@ sub eftch {
     }
 
     # use larger chunk for URL format
-    $chunk = 1000;
+    $chunk = 2000;
     for ( $start = $min; $start < $max; $start += $chunk ) {
 
       my @ids = get_uids ( $dbase, $web, $key, $start, $chunk, $max, $tool, $email );
@@ -2228,7 +2228,7 @@ sub eftch {
     }
 
     # use larger chunk for URL format
-    $chunk = 1000;
+    $chunk = 2000;
     for ( $start = $min; $start < $max; $start += $chunk ) {
 
       my @ids = get_uids ( $dbase, $web, $key, $start, $chunk, $max, $tool, $email );
@@ -2348,6 +2348,18 @@ sub eftch {
 
   # use small chunk because fetched records could be quite large
   $chunk = 100;
+
+  # use larger chunk for accessions
+  if ( $dbase eq "nucleotide" or
+       $dbase eq "nuccore" or
+       $dbase eq "est" or
+       $dbase eq "gss" or
+       $dbase eq "protein" ) {
+    if ( $type eq "ACCN" or $type eq "accn" or $type eq "ACC" or $type eq "acc" ) {
+      $chunk = 4000;
+    }
+  }
+
   for ( $start = $min; $start < $max; $start += $chunk ) {
     $url = $base . $efetch;
 
@@ -4464,11 +4476,13 @@ sub esrch {
   $key = "";
   $num = "";
   $err = "";
+  my $trn = "";
 
   $web = $1 if ($output =~ /<WebEnv>(\S+)<\/WebEnv>/);
   $key = $1 if ($output =~ /<QueryKey>(\S+)<\/QueryKey>/);
   $num = $1 if ($output =~ /<Count>(\S+)<\/Count>/);
   $err = $1 if ($output =~ /<Error>(.+?)<\/Error>/i);
+  $trn = $1 if ($output =~ /<QueryTranslation>(.+?)<\/QueryTranslation>/i);
 
   if ( $err ne "" ) {
     write_edirect ( "", "", "", "", "", $err, "", "" );
@@ -4499,6 +4513,12 @@ sub esrch {
   }
 
   write_edirect ( $dbase, $web, $key, $num, $stp, $err, $tool, $email );
+
+  if ( $log ) {
+    if ( $trn ne "" ) {
+      print STDERR "$trn\n";
+    }
+  }
 }
 
 #  eaddr returns the current user's e-mail address
diff --git a/nquire b/nquire
index 68777b0..389f85c 100755
--- a/nquire
+++ b/nquire
@@ -245,6 +245,9 @@ Examples
   nquire -url "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" elink.fcgi \\
     -dbfrom protein -db protein -cmd neighbor -linkname protein_protein -id NP_476532.1
 
+  nquire -eutils esearch.fcgi -db pubmed -term "transposition immunity Tn3" |
+  xtract -pattern eSearchResult -element QueryTranslation
+
 };
 
 sub nquire {
diff --git a/xtract.go b/xtract.go
index ef609f1..48c0aae 100644
--- a/xtract.go
+++ b/xtract.go
@@ -29,7 +29,14 @@
 // ==========================================================================
 
 /*
-  test for presence of go compiler, cross-compile xtract executables, and pack into archive, by running:
+  Download external GO libraries by running:
+
+  cd "$GOPATH"
+  go get -u golang.org/x/text/runes
+  go get -u golang.org/x/text/transform
+  go get -u golang.org/x/text/unicode/norm
+
+  Test for presence of go compiler, cross-compile xtract executables, and pack into archive, by running:
 
   if hash go 2>/dev/null
   then
@@ -71,7 +78,7 @@ import (
 
 // VERSION AND HELP MESSAGE TEXT
 
-const xtractVersion = "6.80"
+const xtractVersion = "6.90"
 
 const xtractHelp = `
 Overview
@@ -84,14 +91,15 @@ Overview
 
   -group, -block, and -subset limit element exploration to selected XML subregions.
 
-Processing
+Processing Flags
 
-  -cleanup         Fix non-ASCII spaces
   -compress        Compress runs of spaces
-  -plain           Delete Unicode accents
-  -relaxed         Allow PubMed mixed content
+
+  -mixed           Allow PubMed mixed content
   -strict          Remove HTML highlight tags
 
+  -accent          Delete Unicode accents
+
 Data Source
 
   -input           Read XML from file instead of stdin
@@ -388,6 +396,10 @@ PubMed Archive Retrieval
 `
 
 const xtractAdvanced = `
+Processing Flag
+
+  -cleanup    Fix non-ASCII spaces
+
 Processing Commands
 
   -prepare    [release|report] Compare daily update to stash
@@ -1053,32 +1065,38 @@ Processing in Groups
 Phrase Indexing
 
   efetch -db pubmed -id 12857958,2981625 -format xml |
-  xtract -head "<Set>" -tail "</Set>" -hd "<Rec>" -tl "</Rec>" \
+  xtract -head "<IdxDocumentSet>" -tail "</IdxDocumentSet>" \
+    -hd "  <IdxDocument>\n" -tl "  </IdxDocument>" \
     -pattern PubmedArticle \
-      -pfx "<Id>" -sfx "</Id>" -element MedlineCitation/PMID \
-      -rst -indices ArticleTitle,AbstractText,Keyword |
-  xtract -pattern Rec -UID Id \
-    -block Term -pfc "\n" -element "&UID",Term
+      -pfx "    <IdxUid>" -sfx "</IdxUid>\n" \
+      -element MedlineCitation/PMID \
+      -clr -rst -tab "\n" \
+      -lbl "    <IdxSearchFields>" \
+      -indices ArticleTitle,AbstractText,Keyword \
+      -clr -lbl "    </IdxSearchFields>\n" |
+  xtract -pattern IdxDocument -UID IdxUid \
+    -block NORM -pfc "\n" -element "&UID",NORM \
+    -block PAIR -pfc "\n" -element "&UID",PAIR
 
   12857958    allow
-  12857958    allow topo
   12857958    assays
   12857958    binding
-  12857958    binding assays
   12857958    braid
-  12857958    braid relaxation
-  12857958    braid supercoil
   12857958    braiding
-  12857958    braiding system
   12857958    carlo
-  12857958    carlo simulations
   12857958    catenane
-  12857958    catenane configurations
   12857958    chiral
-  12857958    chiral crossings
-  12857958    chiral preference
   12857958    chirality
-  12857958    chirality sensing
+  ...
+  12857958    type
+  12857958    underlying
+  12857958    writhe
+  12857958    allow topo
+  12857958    binding assays
+  12857958    braid relaxation
+  12857958    braid supercoil
+  12857958    braiding system
+  12857958    carlo simulations
   ...
 
 Phrase Searching
@@ -1900,6 +1918,17 @@ var markupRunes = map[rune]rune{
 	'\u208E': ')',
 }
 
+var accentRunes = map[rune]rune{
+	'\u00D8': 'O',
+	'\u00F0': 'd',
+	'\u00F8': 'o',
+	'\u0111': 'd',
+	'\u0131': 'i',
+	'\u0141': 'L',
+	'\u0142': 'l',
+	'\u02BC': '\'',
+}
+
 var argTypeIs = map[string]ArgumentType{
 	"-unit":        EXPLORATION,
 	"-Unit":        EXPLORATION,
@@ -2061,6 +2090,8 @@ var levelTypeIs = map[string]LevelType{
 	"-Pattern":  PATTERN,
 }
 
+var slock sync.RWMutex
+
 var sequenceTypeIs = map[string]SequenceType{
 	"INSDSeq:INSDInterval_from":       {1, ISSTART},
 	"INSDSeq:INSDInterval_to":         {1, ISSTOP},
@@ -2092,7 +2123,41 @@ var sequenceTypeIs = map[string]SequenceType{
 	"Rs:@structLoc":                   {0, ISPOS},
 }
 
+var plock sync.RWMutex
+
 var isStopWord = map[string]bool{
+	"!":             true,
+	"\"":            true,
+	"#":             true,
+	"$":             true,
+	"%":             true,
+	"&":             true,
+	"'":             true,
+	"(":             true,
+	")":             true,
+	"*":             true,
+	"+":             true,
+	",":             true,
+	"-":             true,
+	".":             true,
+	"/":             true,
+	":":             true,
+	";":             true,
+	"<":             true,
+	"=":             true,
+	">":             true,
+	"?":             true,
+	"@":             true,
+	"[":             true,
+	"\\":            true,
+	"]":             true,
+	"^":             true,
+	"_":             true,
+	"`":             true,
+	"{":             true,
+	"|":             true,
+	"}":             true,
+	"~":             true,
 	"a":             true,
 	"about":         true,
 	"again":         true,
@@ -2442,6 +2507,23 @@ func TrimPunctuation(str string) string {
 		if str[0] == '(' && str[max-1] == ')' {
 			// trim flanking parentheses
 			str = str[1 : max-1]
+			max -= 2
+		}
+	}
+
+	if max > 0 {
+		if str[0] == '(' && !strings.Contains(str, ")") {
+			// trim isolated left parentheses
+			str = str[1:]
+			max--
+		}
+	}
+
+	if max > 1 {
+		if str[max-1] == ')' && !strings.Contains(str, "(") {
+			// trim isolated right parentheses
+			str = str[:max-1]
+			// max--
 		}
 	}
 
@@ -2580,6 +2662,7 @@ func HasMarkup(str string) bool {
 		if ch <= 127 {
 			continue
 		}
+		// quick min-to-max check for Unicode superscript or subscript characters
 		if (ch >= '\u00B2' && ch <= '\u00B9') || (ch >= '\u2070' && ch <= '\u208E') {
 			return true
 		}
@@ -2716,18 +2799,93 @@ func ParseFlag(str string) OpType {
 	return UNSET
 }
 
-func HtmlReplacer() *strings.Replacer {
+var (
+	rlock sync.Mutex
+	replr *strings.Replacer
+)
+
+func DoHtmlReplace(str string) string {
+
+	// replacer not reentrant, protected by mutex
+	rlock.Lock()
+
+	if replr == nil {
+		replr = strings.NewReplacer("<i>", "", "</i>", "", "<i/>", "", "<i />", "",
+			"<b>", "", "</b>", "", "<b/>", "", "<b />", "",
+			"<u>", "", "</u>", "", "<u/>", "", "<u />", "",
+			"<sub>", "", "</sub>", "", "<sub/>", "", "<sub />", "",
+			"<sup>", "", "</sup>", "", "<sup/>", "", "<sup />", "")
+	}
+
+	if replr != nil {
+		str = replr.Replace(str)
+	}
+
+	rlock.Unlock()
+
+	return str
+}
+
+func HasBadAccent(str string) bool {
+
+	for _, ch := range str {
+		if ch <= 127 {
+			continue
+		}
+		// quick min-to-max check for additional characters to treat as accents
+		if ch >= '\u00D8' && ch <= '\u02BC' {
+			return true
+		}
+	}
+
+	return false
+}
+
+func FixBadAccent(str string) string {
+
+	var buffer bytes.Buffer
+
+	for _, ch := range str {
+		if ch > 127 {
+			if ch >= '\u00D8' && ch <= '\u02BC' {
+				rn, ok := accentRunes[ch]
+				if ok {
+					ch = rn
+				}
+			}
+		}
+		buffer.WriteRune(ch)
+	}
 
-	return strings.NewReplacer("<i>", "", "</i>", "", "<i/>", "", "<i />", "",
-		"<b>", "", "</b>", "", "<b/>", "", "<b />", "",
-		"<u>", "", "</u>", "", "<u/>", "", "<u />", "",
-		"<sub>", "", "</sub>", "", "<sub/>", "", "<sub />", "",
-		"<sup>", "", "</sup>", "", "<sup/>", "", "<sup />", "")
+	return buffer.String()
 }
 
-func AccentTransformer() transform.Transformer {
+var (
+	tlock sync.Mutex
+	tform transform.Transformer
+)
+
+func DoAccentTransform(str string) string {
+
+	// transformer not reentrant, protected by mutex
+	tlock.Lock()
+
+	if tform == nil {
+		tform = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+	}
+
+	if tform != nil {
+		str, _, _ = transform.String(tform, str)
+	}
+
+	tlock.Unlock()
+
+	// look for characters not in current external runes conversion table
+	if HasBadAccent(str) {
+		str = FixBadAccent(str)
+	}
 
-	return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+	return str
 }
 
 // CREATE COMMON DRIVER TABLES
@@ -3322,7 +3480,9 @@ func ParseArguments(args []string, pttrn string) *Block {
 						seq += match
 					}
 					// confirm -0-based or -1-based arguments are known sequence position elements or attributes
+					slock.RLock()
 					seqtype, ok := sequenceTypeIs[seq]
+					slock.RUnlock()
 					if !ok {
 						fmt.Fprintf(os.Stderr, "\nERROR: Element '%s' is not suitable for sequence coordinate conversion\n", item)
 						os.Exit(1)
@@ -4275,7 +4435,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					idx++
 					return STARTTAG, str[:], atr[:], Line, idx
 				default:
-					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element, line %d\n", line)
+					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element, line %d\n", ch, line)
 					return STARTTAG, str[:], "", Line, idx
 				}
 
@@ -4304,7 +4464,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 
 						return STOPTAG, str[:], "", Line, idx
 					}
-					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element, line %d\n", line)
+					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element, line %d\n", ch, line)
 				case '?':
 					// skip ?xml and ?processing instructions
 					idx++
@@ -4320,8 +4480,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					idx++
 					start = idx
 					ch = text[idx]
-					Which := NOTAG
-					SkipTo := ""
+					Which = NOTAG
+					SkipTo = ""
 					if ch == '[' && strings.HasPrefix(text[idx:], "[CDATA[") {
 						Which = CDATATAG
 						SkipTo = "]]>"
@@ -4389,7 +4549,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					idx++
 					return NOTAG, "", "", Line, idx
 				default:
-					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element, line %d\n", line)
+					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element, line %d\n", ch, line)
 				}
 			}
 
@@ -4773,8 +4933,6 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 		inPattern := false
 		prevName := ""
 
-		mn := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
-
 		for {
 			tag, name, attr, _, idx := nextToken(Idx)
 			Idx = idx
@@ -4890,7 +5048,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					case DOSHRINK:
 						name = CompressRunsOfSpaces(name)
 					case DOACCENT:
-						name, _, _ = transform.String(mn, name)
+						if IsNotASCII(name) {
+							name = DoAccentTransform(name)
+						}
 					default:
 						continue
 					}
@@ -4918,7 +5078,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					case DOSHRINK:
 						name = CompressRunsOfSpaces(name)
 					case DOACCENT:
-						name, _, _ = transform.String(mn, name)
+						if IsNotASCII(name) {
+							name = DoAccentTransform(name)
+						}
 					default:
 						continue
 					}
@@ -5002,15 +5164,6 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 			}
 		}
 
-		var replr *strings.Replacer
-		if tbls.DeGloss {
-			replr = HtmlReplacer()
-		}
-		var mn transform.Transformer
-		if tbls.DeAccent {
-			mn = AccentTransformer()
-		}
-
 		// copy with processing flags
 		if copyRecrd {
 
@@ -5024,8 +5177,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					if HasMarkup(str) {
 						str = RemoveUnicodeMarkup(str)
 					}
-					if HasAngleBracket(str) && replr != nil {
-						str = replr.Replace(str)
+					if HasAngleBracket(str) {
+						str = DoHtmlReplace(str)
 					}
 				}
 				if tbls.DoMixed {
@@ -5034,8 +5187,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 					}
 				}
 				if tbls.DeAccent {
-					if IsNotASCII(str) && mn != nil {
-						str, _, _ = transform.String(mn, str)
+					if IsNotASCII(str) {
+						str = DoAccentTransform(str)
 					}
 				}
 
@@ -5385,8 +5538,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 						if HasMarkup(name) {
 							name = RemoveUnicodeMarkup(name)
 						}
-						if HasAngleBracket(name) && replr != nil {
-							name = replr.Replace(name)
+						if HasAngleBracket(name) {
+							name = DoHtmlReplace(name)
 						}
 					}
 					if tbls.DoMixed {
@@ -5395,8 +5548,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 						}
 					}
 					if tbls.DeAccent {
-						if IsNotASCII(name) && mn != nil {
-							name, _, _ = transform.String(mn, name)
+						if IsNotASCII(name) {
+							name = DoAccentTransform(name)
 						}
 					}
 					if HasFlankingSpace(name) {
@@ -5941,17 +6094,28 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 
 	if doIndex {
 		if isPipe {
-			acc = append(acc, "-head", "<Set>", "-tail", "</Set>", "-hd", "<Rec>", "-tl", "</Rec>")
-			acc = append(acc, "-pattern", "INSDSeq", "-pfx", "<Id>", "-sfx", "</Id>")
-			acc = append(acc, "-element", "INSDSeq_accession-version")
+			acc = append(acc, "-head", "<IdxDocumentSet>", "-tail", "</IdxDocumentSet>")
+			acc = append(acc, "-hd", "  <IdxDocument>\n", "-tl", "  </IdxDocument>")
+			acc = append(acc, "-pattern", "INSDSeq", "-pfx", "    <IdxUid>", "-sfx", "</IdxUid>\n")
+			acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\n")
 		} else {
-			acc = append(acc, "-head", "\"<Set>\"", "-tail", "\"</Set>\"", "-hd", "\"<Rec>\"", "-tl", "\"</Rec>\"")
-			acc = append(acc, "-pattern", "INSDSeq", "-pfx", "\"<Id>\"", "-sfx", "\"</Id>\"")
-			acc = append(acc, "-element", "INSDSeq_accession-version")
+			acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"")
+			acc = append(acc, "-hd", "\"  <IdxDocument>\\n\"", "-tl", "\"  </IdxDocument>\"")
+			acc = append(acc, "-pattern", "INSDSeq", "-pfx", "\"    <IdxUid>\"", "-sfx", "\"</IdxUid>\\n\"")
+			acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\\n")
 		}
 	} else {
 		acc = append(acc, "-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version")
 	}
+
+	if doIndex {
+		if isPipe {
+			acc = append(acc, "-group", "INSDSeq", "-lbl", "    <IdxSearchFields>\n")
+		} else {
+			acc = append(acc, "-group", "INSDSeq", "-lbl", "\"    <IdxSearchFields>\\n\"")
+		}
+	}
+
 	printAccn := true
 
 	// collect descriptors
@@ -5959,7 +6123,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 	if strings.HasPrefix(args[0], "INSD") {
 
 		if doIndex {
-			acc = append(acc, "-rst", "-indices")
+			acc = append(acc, "-clr", "-indices")
 		} else {
 			if isPipe {
 				acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
@@ -6065,7 +6229,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 
 			checkAgainstVocabulary(str, "element", insdtags)
 			if doIndex {
-				acc = append(acc, "-block", "INSDFeature", "-indices")
+				acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
 			} else {
 				if isPipe {
 					acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
@@ -6095,7 +6259,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 
 			checkAgainstVocabulary(str, "element", insdtags)
 			if doIndex {
-				acc = append(acc, "-block", "INSDFeature", "-indices")
+				acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
 			} else {
 				if isPipe {
 					acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
@@ -6121,9 +6285,9 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 				acc = append(acc, "-if", "INSDQualifier_name", "-equals", str[1:])
 				if doIndex {
 					if isPipe {
-						acc = append(acc, "-indices", "%INSDQualifier_value")
+						acc = append(acc, "-clr", "-indices", "%INSDQualifier_value")
 					} else {
-						acc = append(acc, "-indices", "\"%INSDQualifier_value\"")
+						acc = append(acc, "-clr", "-indices", "\"%INSDQualifier_value\"")
 					}
 				} else {
 					if isPipe {
@@ -6143,7 +6307,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 			} else {
 				if doIndex {
 					acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
-					acc = append(acc, "-indices", "INSDQualifier_value")
+					acc = append(acc, "-clr", "-indices", "INSDQualifier_value")
 				} else {
 					acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
 					acc = append(acc, "-element", "INSDQualifier_value")
@@ -6160,6 +6324,14 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
 		}
 	}
 
+	if doIndex {
+		if isPipe {
+			acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", "    </IdxSearchFields>\n")
+		} else {
+			acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", "\"    </IdxSearchFields>\\n\"")
+		}
+	}
+
 	return acc
 }
 
@@ -6784,7 +6956,10 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
 					past := ""
 					for _, item := range words {
 						item = strings.ToLower(item)
-						if isStopWord[item] {
+						plock.RLock()
+						isSW := isStopWord[item]
+						plock.RUnlock()
+						if isSW {
 							past = ""
 							continue
 						}
@@ -6811,30 +6986,39 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
 			}
 		})
 	case INDICES:
-		var tm []string
-
-		mn := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+		var term []string
+		var pair []string
 
 		addToIndex := func(item, past string) string {
 
+			if IsNotASCII(item) {
+				item = DoAccentTransform(item)
+			}
 			item = strings.ToLower(item)
-			if IsNotASCII(item) && mn != nil {
-				item, _, _ = transform.String(mn, item)
+			if HasBadSpace(item) {
+				item = CleanupBadSpaces(item)
 			}
 			if HasMarkup(item) {
 				item = RemoveUnicodeMarkup(item)
 			}
 			item = TrimPunctuation(item)
-			if item == "" || isStopWord[item] {
+			if item == "" {
+				return ""
+			}
+			plock.RLock()
+			isSW := isStopWord[item]
+			plock.RUnlock()
+			if isSW {
 				// skip if stop word, interrupts overlapping word pair chain
 				return ""
 			}
 			ok = true
+			item = html.EscapeString(item)
 			// add single term
-			tm = append(tm, item)
+			term = append(term, item)
 			if past != "" {
 				// add informative adjacent word pair
-				tm = append(tm, past+" "+item)
+				pair = append(pair, past+" "+item)
 			}
 
 			return item
@@ -6861,18 +7045,31 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
 		})
 
 		if ok {
-			// sort array of words and pairs
-			sort.Slice(tm, func(i, j int) bool { return tm[i] < tm[j] })
+			// sort arrays of words and pairs
+			sort.Slice(term, func(i, j int) bool { return term[i] < term[j] })
+			sort.Slice(pair, func(i, j int) bool { return pair[i] < pair[j] })
 
 			last := ""
-			for _, item := range tm {
+			for _, item := range term {
 				if item == last {
 					// skip duplicate entry
 					continue
 				}
-				buffer.WriteString("<Term>")
+				buffer.WriteString("      <NORM>")
 				buffer.WriteString(item)
-				buffer.WriteString("</Term>")
+				buffer.WriteString("</NORM>\n")
+				last = item
+			}
+
+			last = ""
+			for _, item := range pair {
+				if item == last {
+					// skip duplicate entry
+					continue
+				}
+				buffer.WriteString("      <PAIR>")
+				buffer.WriteString(item)
+				buffer.WriteString("</PAIR>\n")
 				last = item
 			}
 		}
@@ -7579,19 +7776,12 @@ func ProcessCommands(cmds *Block, curr *Node, tab, ret string, index, level int,
 // PROCESS ONE XML COMPONENT RECORD
 
 // ProcessQuery calls XML combined tokenizer parser on a partitioned string
-func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, replr *strings.Replacer, mn transform.Transformer, action SpecialType) string {
+func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, action SpecialType) string {
 
 	if Text == "" || tbls == nil {
 		return ""
 	}
 
-	if tbls.DeGloss && replr == nil {
-		replr = HtmlReplacer()
-	}
-	if tbls.DeAccent && mn == nil {
-		mn = AccentTransformer()
-	}
-
 	// node farm variables
 	FarmPos := 0
 	FarmMax := tbls.FarmSize
@@ -7713,7 +7903,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
 					idx++
 					return STARTTAG, str[:], atr[:], idx
 				default:
-					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
+					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element\n", ch)
 					return STARTTAG, str[:], "", idx
 				}
 
@@ -7742,7 +7932,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
 
 						return STOPTAG, str[:], "", idx
 					}
-					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
+					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element\n", ch)
 				case '?':
 					// skip ?xml and ?processing instructions
 					idx++
@@ -7786,7 +7976,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
 						idx++
 					}
 				default:
-					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
+					fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element\n", ch)
 				}
 			}
 
@@ -7878,8 +8068,8 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
 					if HasMarkup(name) {
 						name = RemoveUnicodeMarkup(name)
 					}
-					if HasAngleBracket(name) && replr != nil {
-						name = replr.Replace(name)
+					if HasAngleBracket(name) {
+						name = DoHtmlReplace(name)
 					}
 				}
 				if tbls.DoMixed {
@@ -7888,8 +8078,8 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
 					}
 				}
 				if tbls.DeAccent {
-					if IsNotASCII(name) && mn != nil {
-						name, _, _ = transform.String(mn, name)
+					if IsNotASCII(name) {
+						name = DoAccentTransform(name)
 					}
 				}
 				node.Contents = name
@@ -8373,15 +8563,6 @@ func CreateConsumers(cmds *Block, tbls *Tables, parent string, inp <-chan Extrac
 		// report when this consumer has no more records to process
 		defer wg.Done()
 
-		var replr *strings.Replacer
-		if tbls.DeGloss {
-			replr = HtmlReplacer()
-		}
-		var mn transform.Transformer
-		if tbls.DeAccent {
-			mn = AccentTransformer()
-		}
-
 		// read partitioned XML from producer channel
 		for ext := range inp {
 
@@ -8394,7 +8575,7 @@ func CreateConsumers(cmds *Block, tbls *Tables, parent string, inp <-chan Extrac
 				continue
 			}
 
-			str := ProcessQuery(text[:], parent, idx, cmds, tbls, replr, mn, DOQUERY)
+			str := ProcessQuery(text[:], parent, idx, cmds, tbls, DOQUERY)
 
 			// send even if empty to get all record counts for reordering
 			out <- Extract{idx, "", str}
@@ -8436,15 +8617,6 @@ func CreateExaminers(tbls *Tables, parent string, inp <-chan Extract) <-chan Ext
 		// report when this examiner has no more records to process
 		defer wg.Done()
 
-		var replr *strings.Replacer
-		if tbls.DeGloss {
-			replr = HtmlReplacer()
-		}
-		var mn transform.Transformer
-		if tbls.DeAccent {
-			mn = AccentTransformer()
-		}
-
 		// read partitioned XML from producer channel
 		for ext := range inp {
 
@@ -8457,7 +8629,7 @@ func CreateExaminers(tbls *Tables, parent string, inp <-chan Extract) <-chan Ext
 				continue
 			}
 
-			id := ProcessQuery(text[:], parent, 0, nil, tbls, replr, mn, DOINDEX)
+			id := ProcessQuery(text[:], parent, 0, nil, tbls, DOINDEX)
 
 			// send even if empty to get all record counts for reordering
 			out <- Extract{idx, id, text}
@@ -8977,13 +9149,13 @@ func main() {
 		// data cleanup flags
 		case "-compress":
 			doCompress = true
-		case "-cleanup":
+		case "-cleanup", "-spaces":
 			doCleanup = true
 		case "-strict":
 			deGloss = true
-		case "-relaxed":
+		case "-mixed", "-relaxed":
 			doMixed = true
-		case "-plain":
+		case "-accent", "-plain":
 			deAccent = true
 		// debugging flags
 		case "-prepare":
@@ -9745,7 +9917,7 @@ func main() {
 			func(rec int, ofs int64, str string) {
 				recordCount++
 
-				id := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOINDEX)
+				id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
 				if id == "" {
 					return
 				}
@@ -9936,7 +10108,7 @@ func main() {
 			func(rec int, ofs int64, str string) {
 				recordCount++
 
-				id := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOINDEX)
+				id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
 				if id == "" {
 					return
 				}
@@ -9991,7 +10163,7 @@ func main() {
 			func(rec int, ofs int64, str string) {
 				recordCount++
 
-				res := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOVALID)
+				res := ProcessQuery(str[:], parent, rec, nil, tbls, DOVALID)
 				if res == "" {
 					return
 				}
@@ -10034,7 +10206,7 @@ func main() {
 			func(rec int, ofs int64, str string) {
 				recordCount++
 
-				id := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOINDEX)
+				id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
 				if id == "" {
 					return
 				}
@@ -10158,7 +10330,7 @@ func main() {
 		PartitionPattern(topPattern, star, rdr,
 			func(rec int, ofs int64, str string) {
 				beginTime := time.Now()
-				ProcessQuery(str[:], parent, rec, cmds, tbls, nil, nil, DOQUERY)
+				ProcessQuery(str[:], parent, rec, cmds, tbls, DOQUERY)
 				endTime := time.Now()
 				duration := endTime.Sub(beginTime)
 				micro := int(float64(duration.Nanoseconds()) / 1e3)
@@ -10362,7 +10534,7 @@ func main() {
 		cmds.Position = ""
 
 		// process single selected record
-		res := ProcessQuery(qry[:], parent, idx, cmds, tbls, nil, nil, DOQUERY)
+		res := ProcessQuery(qry[:], parent, idx, cmds, tbls, DOQUERY)
 
 		if res != "" {
 			fmt.Printf("%s\n", res)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ncbi-entrez-direct.git



More information about the debian-med-commit mailing list