[med-svn] [ncbi-entrez-direct] 01/08: New upstream version 6.90.20170705+ds
Aaron M. Ucko
ucko at moszumanska.debian.org
Thu Jul 6 02:16:17 UTC 2017
This is an automated email from the git hooks/post-receive script.
ucko pushed a commit to branch master
in repository ncbi-entrez-direct.
commit 1b48ede6b93841101bf7b93b8e47307d890fbe8d
Author: Aaron M. Ucko <ucko at debian.org>
Date: Wed Jul 5 21:31:52 2017 -0400
New upstream version 6.90.20170705+ds
---
edirect.pl | 28 +++-
nquire | 3 +
xtract.go | 422 +++++++++++++++++++++++++++++++++++++++++++------------------
3 files changed, 324 insertions(+), 129 deletions(-)
diff --git a/edirect.pl b/edirect.pl
index 7c59db4..d888659 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -87,7 +87,7 @@ use constant true => 1;
# EDirect version number
-$version = "6.80";
+$version = "6.90";
# URL address components
@@ -2140,7 +2140,7 @@ sub eftch {
}
# use larger chunk for UID format
- $chunk = 1000;
+ $chunk = 5000;
for ( $start = $min; $start < $max; $start += $chunk ) {
my @ids = get_uids ( $dbase, $web, $key, $start, $chunk, $max, $tool, $email );
@@ -2184,7 +2184,7 @@ sub eftch {
}
# use larger chunk for URL format
- $chunk = 1000;
+ $chunk = 2000;
for ( $start = $min; $start < $max; $start += $chunk ) {
my @ids = get_uids ( $dbase, $web, $key, $start, $chunk, $max, $tool, $email );
@@ -2228,7 +2228,7 @@ sub eftch {
}
# use larger chunk for URL format
- $chunk = 1000;
+ $chunk = 2000;
for ( $start = $min; $start < $max; $start += $chunk ) {
my @ids = get_uids ( $dbase, $web, $key, $start, $chunk, $max, $tool, $email );
@@ -2348,6 +2348,18 @@ sub eftch {
# use small chunk because fetched records could be quite large
$chunk = 100;
+
+ # use larger chunk for accessions
+ if ( $dbase eq "nucleotide" or
+ $dbase eq "nuccore" or
+ $dbase eq "est" or
+ $dbase eq "gss" or
+ $dbase eq "protein" ) {
+ if ( $type eq "ACCN" or $type eq "accn" or $type eq "ACC" or $type eq "acc" ) {
+ $chunk = 4000;
+ }
+ }
+
for ( $start = $min; $start < $max; $start += $chunk ) {
$url = $base . $efetch;
@@ -4464,11 +4476,13 @@ sub esrch {
$key = "";
$num = "";
$err = "";
+ my $trn = "";
$web = $1 if ($output =~ /<WebEnv>(\S+)<\/WebEnv>/);
$key = $1 if ($output =~ /<QueryKey>(\S+)<\/QueryKey>/);
$num = $1 if ($output =~ /<Count>(\S+)<\/Count>/);
$err = $1 if ($output =~ /<Error>(.+?)<\/Error>/i);
+ $trn = $1 if ($output =~ /<QueryTranslation>(.+?)<\/QueryTranslation>/i);
if ( $err ne "" ) {
write_edirect ( "", "", "", "", "", $err, "", "" );
@@ -4499,6 +4513,12 @@ sub esrch {
}
write_edirect ( $dbase, $web, $key, $num, $stp, $err, $tool, $email );
+
+ if ( $log ) {
+ if ( $trn ne "" ) {
+ print STDERR "$trn\n";
+ }
+ }
}
# eaddr returns the current user's e-mail address
diff --git a/nquire b/nquire
index 68777b0..389f85c 100755
--- a/nquire
+++ b/nquire
@@ -245,6 +245,9 @@ Examples
nquire -url "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" elink.fcgi \\
-dbfrom protein -db protein -cmd neighbor -linkname protein_protein -id NP_476532.1
+ nquire -eutils esearch.fcgi -db pubmed -term "transposition immunity Tn3" |
+ xtract -pattern eSearchResult -element QueryTranslation
+
};
sub nquire {
diff --git a/xtract.go b/xtract.go
index ef609f1..48c0aae 100644
--- a/xtract.go
+++ b/xtract.go
@@ -29,7 +29,14 @@
// ==========================================================================
/*
- test for presence of go compiler, cross-compile xtract executables, and pack into archive, by running:
+ Download external GO libraries by running:
+
+ cd "$GOPATH"
+ go get -u golang.org/x/text/runes
+ go get -u golang.org/x/text/transform
+ go get -u golang.org/x/text/unicode/norm
+
+ Test for presence of go compiler, cross-compile xtract executables, and pack into archive, by running:
if hash go 2>/dev/null
then
@@ -71,7 +78,7 @@ import (
// VERSION AND HELP MESSAGE TEXT
-const xtractVersion = "6.80"
+const xtractVersion = "6.90"
const xtractHelp = `
Overview
@@ -84,14 +91,15 @@ Overview
-group, -block, and -subset limit element exploration to selected XML subregions.
-Processing
+Processing Flags
- -cleanup Fix non-ASCII spaces
-compress Compress runs of spaces
- -plain Delete Unicode accents
- -relaxed Allow PubMed mixed content
+
+ -mixed Allow PubMed mixed content
-strict Remove HTML highlight tags
+ -accent Delete Unicode accents
+
Data Source
-input Read XML from file instead of stdin
@@ -388,6 +396,10 @@ PubMed Archive Retrieval
`
const xtractAdvanced = `
+Processing Flag
+
+ -cleanup Fix non-ASCII spaces
+
Processing Commands
-prepare [release|report] Compare daily update to stash
@@ -1053,32 +1065,38 @@ Processing in Groups
Phrase Indexing
efetch -db pubmed -id 12857958,2981625 -format xml |
- xtract -head "<Set>" -tail "</Set>" -hd "<Rec>" -tl "</Rec>" \
+ xtract -head "<IdxDocumentSet>" -tail "</IdxDocumentSet>" \
+ -hd " <IdxDocument>\n" -tl " </IdxDocument>" \
-pattern PubmedArticle \
- -pfx "<Id>" -sfx "</Id>" -element MedlineCitation/PMID \
- -rst -indices ArticleTitle,AbstractText,Keyword |
- xtract -pattern Rec -UID Id \
- -block Term -pfc "\n" -element "&UID",Term
+ -pfx " <IdxUid>" -sfx "</IdxUid>\n" \
+ -element MedlineCitation/PMID \
+ -clr -rst -tab "\n" \
+ -lbl " <IdxSearchFields>" \
+ -indices ArticleTitle,AbstractText,Keyword \
+ -clr -lbl " </IdxSearchFields>\n" |
+ xtract -pattern IdxDocument -UID IdxUid \
+ -block NORM -pfc "\n" -element "&UID",NORM \
+ -block PAIR -pfc "\n" -element "&UID",PAIR
12857958 allow
- 12857958 allow topo
12857958 assays
12857958 binding
- 12857958 binding assays
12857958 braid
- 12857958 braid relaxation
- 12857958 braid supercoil
12857958 braiding
- 12857958 braiding system
12857958 carlo
- 12857958 carlo simulations
12857958 catenane
- 12857958 catenane configurations
12857958 chiral
- 12857958 chiral crossings
- 12857958 chiral preference
12857958 chirality
- 12857958 chirality sensing
+ ...
+ 12857958 type
+ 12857958 underlying
+ 12857958 writhe
+ 12857958 allow topo
+ 12857958 binding assays
+ 12857958 braid relaxation
+ 12857958 braid supercoil
+ 12857958 braiding system
+ 12857958 carlo simulations
...
Phrase Searching
@@ -1900,6 +1918,17 @@ var markupRunes = map[rune]rune{
'\u208E': ')',
}
+var accentRunes = map[rune]rune{
+ '\u00D8': 'O',
+ '\u00F0': 'd',
+ '\u00F8': 'o',
+ '\u0111': 'd',
+ '\u0131': 'i',
+ '\u0141': 'L',
+ '\u0142': 'l',
+ '\u02BC': '\'',
+}
+
var argTypeIs = map[string]ArgumentType{
"-unit": EXPLORATION,
"-Unit": EXPLORATION,
@@ -2061,6 +2090,8 @@ var levelTypeIs = map[string]LevelType{
"-Pattern": PATTERN,
}
+var slock sync.RWMutex
+
var sequenceTypeIs = map[string]SequenceType{
"INSDSeq:INSDInterval_from": {1, ISSTART},
"INSDSeq:INSDInterval_to": {1, ISSTOP},
@@ -2092,7 +2123,41 @@ var sequenceTypeIs = map[string]SequenceType{
"Rs:@structLoc": {0, ISPOS},
}
+var plock sync.RWMutex
+
var isStopWord = map[string]bool{
+ "!": true,
+ "\"": true,
+ "#": true,
+ "$": true,
+ "%": true,
+ "&": true,
+ "'": true,
+ "(": true,
+ ")": true,
+ "*": true,
+ "+": true,
+ ",": true,
+ "-": true,
+ ".": true,
+ "/": true,
+ ":": true,
+ ";": true,
+ "<": true,
+ "=": true,
+ ">": true,
+ "?": true,
+ "@": true,
+ "[": true,
+ "\\": true,
+ "]": true,
+ "^": true,
+ "_": true,
+ "`": true,
+ "{": true,
+ "|": true,
+ "}": true,
+ "~": true,
"a": true,
"about": true,
"again": true,
@@ -2442,6 +2507,23 @@ func TrimPunctuation(str string) string {
if str[0] == '(' && str[max-1] == ')' {
// trim flanking parentheses
str = str[1 : max-1]
+ max -= 2
+ }
+ }
+
+ if max > 0 {
+ if str[0] == '(' && !strings.Contains(str, ")") {
+ // trim isolated left parentheses
+ str = str[1:]
+ max--
+ }
+ }
+
+ if max > 1 {
+ if str[max-1] == ')' && !strings.Contains(str, "(") {
+ // trim isolated right parentheses
+ str = str[:max-1]
+ // max--
}
}
@@ -2580,6 +2662,7 @@ func HasMarkup(str string) bool {
if ch <= 127 {
continue
}
+ // quick min-to-max check for Unicode superscript or subscript characters
if (ch >= '\u00B2' && ch <= '\u00B9') || (ch >= '\u2070' && ch <= '\u208E') {
return true
}
@@ -2716,18 +2799,93 @@ func ParseFlag(str string) OpType {
return UNSET
}
-func HtmlReplacer() *strings.Replacer {
+var (
+ rlock sync.Mutex
+ replr *strings.Replacer
+)
+
+func DoHtmlReplace(str string) string {
+
+ // replacer not reentrant, protected by mutex
+ rlock.Lock()
+
+ if replr == nil {
+ replr = strings.NewReplacer("<i>", "", "</i>", "", "<i/>", "", "<i />", "",
+ "<b>", "", "</b>", "", "<b/>", "", "<b />", "",
+ "<u>", "", "</u>", "", "<u/>", "", "<u />", "",
+ "<sub>", "", "</sub>", "", "<sub/>", "", "<sub />", "",
+ "<sup>", "", "</sup>", "", "<sup/>", "", "<sup />", "")
+ }
+
+ if replr != nil {
+ str = replr.Replace(str)
+ }
+
+ rlock.Unlock()
+
+ return str
+}
+
+func HasBadAccent(str string) bool {
+
+ for _, ch := range str {
+ if ch <= 127 {
+ continue
+ }
+ // quick min-to-max check for additional characters to treat as accents
+ if ch >= '\u00D8' && ch <= '\u02BC' {
+ return true
+ }
+ }
+
+ return false
+}
+
+func FixBadAccent(str string) string {
+
+ var buffer bytes.Buffer
+
+ for _, ch := range str {
+ if ch > 127 {
+ if ch >= '\u00D8' && ch <= '\u02BC' {
+ rn, ok := accentRunes[ch]
+ if ok {
+ ch = rn
+ }
+ }
+ }
+ buffer.WriteRune(ch)
+ }
- return strings.NewReplacer("<i>", "", "</i>", "", "<i/>", "", "<i />", "",
- "<b>", "", "</b>", "", "<b/>", "", "<b />", "",
- "<u>", "", "</u>", "", "<u/>", "", "<u />", "",
- "<sub>", "", "</sub>", "", "<sub/>", "", "<sub />", "",
- "<sup>", "", "</sup>", "", "<sup/>", "", "<sup />", "")
+ return buffer.String()
}
-func AccentTransformer() transform.Transformer {
+var (
+ tlock sync.Mutex
+ tform transform.Transformer
+)
+
+func DoAccentTransform(str string) string {
+
+ // transformer not reentrant, protected by mutex
+ tlock.Lock()
+
+ if tform == nil {
+ tform = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+ }
+
+ if tform != nil {
+ str, _, _ = transform.String(tform, str)
+ }
+
+ tlock.Unlock()
+
+ // look for characters not in current external runes conversion table
+ if HasBadAccent(str) {
+ str = FixBadAccent(str)
+ }
- return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+ return str
}
// CREATE COMMON DRIVER TABLES
@@ -3322,7 +3480,9 @@ func ParseArguments(args []string, pttrn string) *Block {
seq += match
}
// confirm -0-based or -1-based arguments are known sequence position elements or attributes
+ slock.RLock()
seqtype, ok := sequenceTypeIs[seq]
+ slock.RUnlock()
if !ok {
fmt.Fprintf(os.Stderr, "\nERROR: Element '%s' is not suitable for sequence coordinate conversion\n", item)
os.Exit(1)
@@ -4275,7 +4435,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
idx++
return STARTTAG, str[:], atr[:], Line, idx
default:
- fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element, line %d\n", line)
+ fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element, line %d\n", ch, line)
return STARTTAG, str[:], "", Line, idx
}
@@ -4304,7 +4464,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
return STOPTAG, str[:], "", Line, idx
}
- fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element, line %d\n", line)
+ fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element, line %d\n", ch, line)
case '?':
// skip ?xml and ?processing instructions
idx++
@@ -4320,8 +4480,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
idx++
start = idx
ch = text[idx]
- Which := NOTAG
- SkipTo := ""
+ Which = NOTAG
+ SkipTo = ""
if ch == '[' && strings.HasPrefix(text[idx:], "[CDATA[") {
Which = CDATATAG
SkipTo = "]]>"
@@ -4389,7 +4549,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
idx++
return NOTAG, "", "", Line, idx
default:
- fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element, line %d\n", line)
+ fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element, line %d\n", ch, line)
}
}
@@ -4773,8 +4933,6 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
inPattern := false
prevName := ""
- mn := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
-
for {
tag, name, attr, _, idx := nextToken(Idx)
Idx = idx
@@ -4890,7 +5048,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
case DOSHRINK:
name = CompressRunsOfSpaces(name)
case DOACCENT:
- name, _, _ = transform.String(mn, name)
+ if IsNotASCII(name) {
+ name = DoAccentTransform(name)
+ }
default:
continue
}
@@ -4918,7 +5078,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
case DOSHRINK:
name = CompressRunsOfSpaces(name)
case DOACCENT:
- name, _, _ = transform.String(mn, name)
+ if IsNotASCII(name) {
+ name = DoAccentTransform(name)
+ }
default:
continue
}
@@ -5002,15 +5164,6 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
}
}
- var replr *strings.Replacer
- if tbls.DeGloss {
- replr = HtmlReplacer()
- }
- var mn transform.Transformer
- if tbls.DeAccent {
- mn = AccentTransformer()
- }
-
// copy with processing flags
if copyRecrd {
@@ -5024,8 +5177,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasMarkup(str) {
str = RemoveUnicodeMarkup(str)
}
- if HasAngleBracket(str) && replr != nil {
- str = replr.Replace(str)
+ if HasAngleBracket(str) {
+ str = DoHtmlReplace(str)
}
}
if tbls.DoMixed {
@@ -5034,8 +5187,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
}
}
if tbls.DeAccent {
- if IsNotASCII(str) && mn != nil {
- str, _, _ = transform.String(mn, str)
+ if IsNotASCII(str) {
+ str = DoAccentTransform(str)
}
}
@@ -5385,8 +5538,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
if HasMarkup(name) {
name = RemoveUnicodeMarkup(name)
}
- if HasAngleBracket(name) && replr != nil {
- name = replr.Replace(name)
+ if HasAngleBracket(name) {
+ name = DoHtmlReplace(name)
}
}
if tbls.DoMixed {
@@ -5395,8 +5548,8 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
}
}
if tbls.DeAccent {
- if IsNotASCII(name) && mn != nil {
- name, _, _ = transform.String(mn, name)
+ if IsNotASCII(name) {
+ name = DoAccentTransform(name)
}
}
if HasFlankingSpace(name) {
@@ -5941,17 +6094,28 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
if doIndex {
if isPipe {
- acc = append(acc, "-head", "<Set>", "-tail", "</Set>", "-hd", "<Rec>", "-tl", "</Rec>")
- acc = append(acc, "-pattern", "INSDSeq", "-pfx", "<Id>", "-sfx", "</Id>")
- acc = append(acc, "-element", "INSDSeq_accession-version")
+ acc = append(acc, "-head", "<IdxDocumentSet>", "-tail", "</IdxDocumentSet>")
+ acc = append(acc, "-hd", " <IdxDocument>\n", "-tl", " </IdxDocument>")
+ acc = append(acc, "-pattern", "INSDSeq", "-pfx", " <IdxUid>", "-sfx", "</IdxUid>\n")
+ acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\n")
} else {
- acc = append(acc, "-head", "\"<Set>\"", "-tail", "\"</Set>\"", "-hd", "\"<Rec>\"", "-tl", "\"</Rec>\"")
- acc = append(acc, "-pattern", "INSDSeq", "-pfx", "\"<Id>\"", "-sfx", "\"</Id>\"")
- acc = append(acc, "-element", "INSDSeq_accession-version")
+ acc = append(acc, "-head", "\"<IdxDocumentSet>\"", "-tail", "\"</IdxDocumentSet>\"")
+ acc = append(acc, "-hd", "\" <IdxDocument>\\n\"", "-tl", "\" </IdxDocument>\"")
+ acc = append(acc, "-pattern", "INSDSeq", "-pfx", "\" <IdxUid>\"", "-sfx", "\"</IdxUid>\\n\"")
+ acc = append(acc, "-element", "INSDSeq_accession-version", "-clr", "-rst", "-tab", "\\n")
}
} else {
acc = append(acc, "-pattern", "INSDSeq", "-ACCN", "INSDSeq_accession-version")
}
+
+ if doIndex {
+ if isPipe {
+ acc = append(acc, "-group", "INSDSeq", "-lbl", " <IdxSearchFields>\n")
+ } else {
+ acc = append(acc, "-group", "INSDSeq", "-lbl", "\" <IdxSearchFields>\\n\"")
+ }
+ }
+
printAccn := true
// collect descriptors
@@ -5959,7 +6123,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
if strings.HasPrefix(args[0], "INSD") {
if doIndex {
- acc = append(acc, "-rst", "-indices")
+ acc = append(acc, "-clr", "-indices")
} else {
if isPipe {
acc = append(acc, "-clr", "-pfx", "\\n", "-element", "&ACCN")
@@ -6065,7 +6229,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
checkAgainstVocabulary(str, "element", insdtags)
if doIndex {
- acc = append(acc, "-block", "INSDFeature", "-indices")
+ acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
} else {
if isPipe {
acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
@@ -6095,7 +6259,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
checkAgainstVocabulary(str, "element", insdtags)
if doIndex {
- acc = append(acc, "-block", "INSDFeature", "-indices")
+ acc = append(acc, "-block", "INSDFeature", "-clr", "-indices")
} else {
if isPipe {
acc = append(acc, "-block", "INSDFeature", "-sep", "|", "-element")
@@ -6121,9 +6285,9 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
acc = append(acc, "-if", "INSDQualifier_name", "-equals", str[1:])
if doIndex {
if isPipe {
- acc = append(acc, "-indices", "%INSDQualifier_value")
+ acc = append(acc, "-clr", "-indices", "%INSDQualifier_value")
} else {
- acc = append(acc, "-indices", "\"%INSDQualifier_value\"")
+ acc = append(acc, "-clr", "-indices", "\"%INSDQualifier_value\"")
}
} else {
if isPipe {
@@ -6143,7 +6307,7 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
} else {
if doIndex {
acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
- acc = append(acc, "-indices", "INSDQualifier_value")
+ acc = append(acc, "-clr", "-indices", "INSDQualifier_value")
} else {
acc = append(acc, "-if", "INSDQualifier_name", "-equals", str)
acc = append(acc, "-element", "INSDQualifier_value")
@@ -6160,6 +6324,14 @@ func ProcessINSD(args []string, isPipe, addDash, doIndex bool) []string {
}
}
+ if doIndex {
+ if isPipe {
+ acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", " </IdxSearchFields>\n")
+ } else {
+ acc = append(acc, "-group", "INSDSeq", "-clr", "-lbl", "\" </IdxSearchFields>\\n\"")
+ }
+ }
+
return acc
}
@@ -6784,7 +6956,10 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
past := ""
for _, item := range words {
item = strings.ToLower(item)
- if isStopWord[item] {
+ plock.RLock()
+ isSW := isStopWord[item]
+ plock.RUnlock()
+ if isSW {
past = ""
continue
}
@@ -6811,30 +6986,39 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
}
})
case INDICES:
- var tm []string
-
- mn := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+ var term []string
+ var pair []string
addToIndex := func(item, past string) string {
+ if IsNotASCII(item) {
+ item = DoAccentTransform(item)
+ }
item = strings.ToLower(item)
- if IsNotASCII(item) && mn != nil {
- item, _, _ = transform.String(mn, item)
+ if HasBadSpace(item) {
+ item = CleanupBadSpaces(item)
}
if HasMarkup(item) {
item = RemoveUnicodeMarkup(item)
}
item = TrimPunctuation(item)
- if item == "" || isStopWord[item] {
+ if item == "" {
+ return ""
+ }
+ plock.RLock()
+ isSW := isStopWord[item]
+ plock.RUnlock()
+ if isSW {
// skip if stop word, interrupts overlapping word pair chain
return ""
}
ok = true
+ item = html.EscapeString(item)
// add single term
- tm = append(tm, item)
+ term = append(term, item)
if past != "" {
// add informative adjacent word pair
- tm = append(tm, past+" "+item)
+ pair = append(pair, past+" "+item)
}
return item
@@ -6861,18 +7045,31 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
})
if ok {
- // sort array of words and pairs
- sort.Slice(tm, func(i, j int) bool { return tm[i] < tm[j] })
+ // sort arrays of words and pairs
+ sort.Slice(term, func(i, j int) bool { return term[i] < term[j] })
+ sort.Slice(pair, func(i, j int) bool { return pair[i] < pair[j] })
last := ""
- for _, item := range tm {
+ for _, item := range term {
if item == last {
// skip duplicate entry
continue
}
- buffer.WriteString("<Term>")
+ buffer.WriteString(" <NORM>")
buffer.WriteString(item)
- buffer.WriteString("</Term>")
+ buffer.WriteString("</NORM>\n")
+ last = item
+ }
+
+ last = ""
+ for _, item := range pair {
+ if item == last {
+ // skip duplicate entry
+ continue
+ }
+ buffer.WriteString(" <PAIR>")
+ buffer.WriteString(item)
+ buffer.WriteString("</PAIR>\n")
last = item
}
}
@@ -7579,19 +7776,12 @@ func ProcessCommands(cmds *Block, curr *Node, tab, ret string, index, level int,
// PROCESS ONE XML COMPONENT RECORD
// ProcessQuery calls XML combined tokenizer parser on a partitioned string
-func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, replr *strings.Replacer, mn transform.Transformer, action SpecialType) string {
+func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, action SpecialType) string {
if Text == "" || tbls == nil {
return ""
}
- if tbls.DeGloss && replr == nil {
- replr = HtmlReplacer()
- }
- if tbls.DeAccent && mn == nil {
- mn = AccentTransformer()
- }
-
// node farm variables
FarmPos := 0
FarmMax := tbls.FarmSize
@@ -7713,7 +7903,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
idx++
return STARTTAG, str[:], atr[:], idx
default:
- fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
+ fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element\n", ch)
return STARTTAG, str[:], "", idx
}
@@ -7742,7 +7932,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
return STOPTAG, str[:], "", idx
}
- fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
+ fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element\n", ch)
case '?':
// skip ?xml and ?processing instructions
idx++
@@ -7786,7 +7976,7 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
idx++
}
default:
- fmt.Fprintf(os.Stderr, "\nUnexpected punctuation in XML element\n")
+ fmt.Fprintf(os.Stderr, "\nUnexpected punctuation '%c' in XML element\n", ch)
}
}
@@ -7878,8 +8068,8 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
if HasMarkup(name) {
name = RemoveUnicodeMarkup(name)
}
- if HasAngleBracket(name) && replr != nil {
- name = replr.Replace(name)
+ if HasAngleBracket(name) {
+ name = DoHtmlReplace(name)
}
}
if tbls.DoMixed {
@@ -7888,8 +8078,8 @@ func ProcessQuery(Text, parent string, index int, cmds *Block, tbls *Tables, rep
}
}
if tbls.DeAccent {
- if IsNotASCII(name) && mn != nil {
- name, _, _ = transform.String(mn, name)
+ if IsNotASCII(name) {
+ name = DoAccentTransform(name)
}
}
node.Contents = name
@@ -8373,15 +8563,6 @@ func CreateConsumers(cmds *Block, tbls *Tables, parent string, inp <-chan Extrac
// report when this consumer has no more records to process
defer wg.Done()
- var replr *strings.Replacer
- if tbls.DeGloss {
- replr = HtmlReplacer()
- }
- var mn transform.Transformer
- if tbls.DeAccent {
- mn = AccentTransformer()
- }
-
// read partitioned XML from producer channel
for ext := range inp {
@@ -8394,7 +8575,7 @@ func CreateConsumers(cmds *Block, tbls *Tables, parent string, inp <-chan Extrac
continue
}
- str := ProcessQuery(text[:], parent, idx, cmds, tbls, replr, mn, DOQUERY)
+ str := ProcessQuery(text[:], parent, idx, cmds, tbls, DOQUERY)
// send even if empty to get all record counts for reordering
out <- Extract{idx, "", str}
@@ -8436,15 +8617,6 @@ func CreateExaminers(tbls *Tables, parent string, inp <-chan Extract) <-chan Ext
// report when this examiner has no more records to process
defer wg.Done()
- var replr *strings.Replacer
- if tbls.DeGloss {
- replr = HtmlReplacer()
- }
- var mn transform.Transformer
- if tbls.DeAccent {
- mn = AccentTransformer()
- }
-
// read partitioned XML from producer channel
for ext := range inp {
@@ -8457,7 +8629,7 @@ func CreateExaminers(tbls *Tables, parent string, inp <-chan Extract) <-chan Ext
continue
}
- id := ProcessQuery(text[:], parent, 0, nil, tbls, replr, mn, DOINDEX)
+ id := ProcessQuery(text[:], parent, 0, nil, tbls, DOINDEX)
// send even if empty to get all record counts for reordering
out <- Extract{idx, id, text}
@@ -8977,13 +9149,13 @@ func main() {
// data cleanup flags
case "-compress":
doCompress = true
- case "-cleanup":
+ case "-cleanup", "-spaces":
doCleanup = true
case "-strict":
deGloss = true
- case "-relaxed":
+ case "-mixed", "-relaxed":
doMixed = true
- case "-plain":
+ case "-accent", "-plain":
deAccent = true
// debugging flags
case "-prepare":
@@ -9745,7 +9917,7 @@ func main() {
func(rec int, ofs int64, str string) {
recordCount++
- id := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOINDEX)
+ id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
if id == "" {
return
}
@@ -9936,7 +10108,7 @@ func main() {
func(rec int, ofs int64, str string) {
recordCount++
- id := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOINDEX)
+ id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
if id == "" {
return
}
@@ -9991,7 +10163,7 @@ func main() {
func(rec int, ofs int64, str string) {
recordCount++
- res := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOVALID)
+ res := ProcessQuery(str[:], parent, rec, nil, tbls, DOVALID)
if res == "" {
return
}
@@ -10034,7 +10206,7 @@ func main() {
func(rec int, ofs int64, str string) {
recordCount++
- id := ProcessQuery(str[:], parent, rec, nil, tbls, nil, nil, DOINDEX)
+ id := ProcessQuery(str[:], parent, rec, nil, tbls, DOINDEX)
if id == "" {
return
}
@@ -10158,7 +10330,7 @@ func main() {
PartitionPattern(topPattern, star, rdr,
func(rec int, ofs int64, str string) {
beginTime := time.Now()
- ProcessQuery(str[:], parent, rec, cmds, tbls, nil, nil, DOQUERY)
+ ProcessQuery(str[:], parent, rec, cmds, tbls, DOQUERY)
endTime := time.Now()
duration := endTime.Sub(beginTime)
micro := int(float64(duration.Nanoseconds()) / 1e3)
@@ -10362,7 +10534,7 @@ func main() {
cmds.Position = ""
// process single selected record
- res := ProcessQuery(qry[:], parent, idx, cmds, tbls, nil, nil, DOQUERY)
+ res := ProcessQuery(qry[:], parent, idx, cmds, tbls, DOQUERY)
if res != "" {
fmt.Printf("%s\n", res)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ncbi-entrez-direct.git
More information about the debian-med-commit
mailing list