[med-svn] [ncbi-entrez-direct] 03/08: New upstream version 6.10.20170123+ds
Aaron M. Ucko
ucko at moszumanska.debian.org
Wed Jan 25 03:00:00 UTC 2017
This is an automated email from the git hooks/post-receive script.
ucko pushed a commit to branch master
in repository ncbi-entrez-direct.
commit b58755d334de654f2386b839ceef9e9954317b86
Author: Aaron M. Ucko <ucko at debian.org>
Date: Tue Jan 24 21:15:47 2017 -0500
New upstream version 6.10.20170123+ds
---
edirect.pl | 99 ++---------------
run-ncbi-converter | 95 ++++++++++++++++
xtract.go | 319 ++++++++++++++++++++++++++++++++++++++++++++++-------
3 files changed, 387 insertions(+), 126 deletions(-)
diff --git a/edirect.pl b/edirect.pl
index 9bf7d77..3ab53f4 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -87,7 +87,7 @@ use constant true => 1;
# EDirect version number
-$version = "6.00";
+$version = "6.10";
# URL address components
@@ -154,7 +154,6 @@ sub clearflags {
$mxdate = "";
$name = "";
$neighbor = false;
- $nogi = false;
$num = "";
$organism = "";
$output = "";
@@ -1633,45 +1632,6 @@ sub fix_bad_encoding {
return $data;
}
-sub accn_to_gi {
-
- my $dbsx = shift (@_);
- my $accn = shift (@_);
- my $nogi = shift (@_);
-
- my $id = 0;
-
- if ( $dbsx eq "" or $accn eq "" ) {
- return 0;
- }
-
- if ( $nogi ) {
- return $accn;
- }
-
- if ( $dbsx ne "nucleotide" and
- $dbsx ne "nuccore" and
- $dbsx ne "est" and
- $dbsx ne "gss" and
- $dbsx ne "protein" ) {
- print STDERR "\nFor -db $dbsx, the -id argument must be numeric\n\n";
- return 0;
- }
-
- $url = $base . $esearch;
- $url .= "?db=$dbsx&&term=$accn%5bACCN%5d";
-
- $output = get ($url);
-
- if ( $output eq "" ) {
- print STDERR "No get_count output returned from '$url'\n";
- }
-
- $id = $1 if ($output =~ /<Id>(\S+)<\/Id>/);
-
- return $id;
-}
-
sub esmry {
my $dbase = shift (@_);
@@ -1691,7 +1651,6 @@ sub esmry {
my $http = shift (@_);
my $alias = shift (@_);
my $basx = shift (@_);
- my $nogi = shift (@_);
$dbase = lc($dbase);
@@ -1711,20 +1670,6 @@ sub esmry {
}
}
- if ($id !~ /^\d+$/ and $id !~ /,/) {
-
- # convert single accession to GI number
-
- $id = accn_to_gi ($dbase, $id, $nogi);
-
- if ( $id eq "0" ) {
-
- # id "0" is an unrecognized accession
-
- return;
- }
- }
-
$url = $base . $esummary;
$arg = "db=$dbase&id=$id";
@@ -2055,7 +2000,6 @@ sub eftch {
"extrafeat=i" => \$extrafeat,
"start=i" => \$min,
"stop=i" => \$max,
- "nogi" => \$nogi,
"email=s" => \$emaddr,
"tool=s" => \$tuul,
"pipe" => \$pipe,
@@ -2149,7 +2093,7 @@ sub eftch {
if ( $type eq "docsum" or $fnc eq "-summary" ) {
esmry ( $dbase, $web, $key, $num, $id, $mode, $min, $max, $tool, $email,
- $silent, $verbose, $debug, $log, $http, $alias, $basx, $nogi );
+ $silent, $verbose, $debug, $log, $http, $alias, $basx );
return;
}
@@ -2165,20 +2109,6 @@ sub eftch {
if ( $id ne "" ) {
- if ($id !~ /^\d+$/ and $id !~ /,/) {
-
- # convert single accession to GI number
-
- $id = accn_to_gi ($dbase, $id, $nogi);
-
- if ( $id eq "0" ) {
-
- # id "0" is an unrecognized accession
-
- return;
- }
- }
-
my @ids = split (',', $id);
foreach $uid (@ids) {
print "$uid\n";
@@ -3090,6 +3020,7 @@ sub elink {
$link_help,
"db=s" => \$db,
"id=s" => \$id,
+ "format=s" => \$type,
"target=s" => \$dbto,
"name=s" => \$name,
"related" => \$related,
@@ -3099,7 +3030,6 @@ sub elink {
"batch" => \$batch,
"holding=s" => \$holding,
"label=s" => \$lbl,
- "nogi" => \$nogi,
"email=s" => \$emaddr,
"tool=s" => \$tuul,
"help" => \$help,
@@ -3191,20 +3121,6 @@ sub elink {
if ( $dbase ne "" and $id ne "" ) {
- if ($id !~ /^\d+$/ and $id !~ /,/) {
-
- # convert single accession to GI number
-
- $id = accn_to_gi ($dbase, $id, $nogi);
-
- if ( $id eq "0" ) {
-
- # id "0" is an unrecognized accession
-
- return;
- }
- }
-
# process db and id command-line arguments instead of getting from history
$url = $base . $elink;
@@ -3218,6 +3134,9 @@ sub elink {
$arg .= "&retmode=$mode";
}
$arg .= "&id=$id";
+ if ( $type eq "acc" ) {
+ $arg .= "&idtype=acc";
+ }
$data = do_post ($url, $arg, $tool, $email, true);
@@ -3276,6 +3195,9 @@ sub elink {
}
$arg .= "&id=";
$arg .= join ('&id=', @ids);
+ if ( $type eq "acc" ) {
+ $arg .= "&idtype=acc";
+ }
$data = do_post ($url, $arg, $tool, $email, true);
@@ -3306,6 +3228,9 @@ sub elink {
if ( $mode ne "" ) {
$arg .= "&retmode=$mode";
}
+ if ( $type eq "acc" ) {
+ $arg .= "&idtype=acc";
+ }
$wb = $web;
$ky = $key;
diff --git a/run-ncbi-converter b/run-ncbi-converter
new file mode 100755
index 0000000..d54f040
--- /dev/null
+++ b/run-ncbi-converter
@@ -0,0 +1,95 @@
+#!/usr/bin/perl -w
+use strict;
+
+use File::Path;
+use Net::FTP;
+use POSIX qw(uname);
+
+# Fetch the named NCBI converter into a local cache if needed, then run it.
+@ARGV or die "Usage: $0 converter-name [converter-arguments]\n";
+
+my $cache_dir = "$ENV{HOME}/.cache/ncbi-converters";    # default location
+$cache_dir = $ENV{NCBI_CONVERTER_DIR} if defined $ENV{NCBI_CONVERTER_DIR};
+
+my $server = 'ftp.ncbi.nlm.nih.gov';
+my $platform = DetectPlatform();
+my $dir = "/toolbox/ncbi_tools/converters/by_platform/$platform";
+my $ext = ($platform eq 'win') ? 'zip' : 'gz';
+my $binext = ($platform eq 'win') ? '.exe' : '';
+my $archive = "$ARGV[0].$platform.$ext";
+my $executable = "$cache_dir/" . $ARGV[0] . $binext;
+
+if ( ! -d $cache_dir ) {
+    File::Path::make_path($cache_dir)
+        or die "Unable to ensure the existence of $cache_dir: $!";
+}
+
+my $ftp = Net::FTP->new($server, Passive => 1)    # failure reason is in $@
+    or die "Unable to connect to FTP server: $@";
+$ftp->login or die "Unable to log in to FTP server";
+$ftp->cwd($dir) or die "Unable to change to $dir";
+$ftp->binary or warn "Unable to set binary mode";
+
+my $time = $ftp->mdtm($archive);
+my @stats = stat $executable;
+if ( !@stats || $stats[9] < $time) {    # cached binary missing or stale
+    $ftp->get($archive, "$cache_dir/$archive")
+        or die "Unable to download $archive";
+    utime $time, $time, "$cache_dir/$archive";
+    my $pid = fork();
+    if (!defined $pid) {    # fork signals failure with undef, not a negative
+        die "Unable to fork for unpacking: $!";
+    } elsif ($pid > 0) {
+        waitpid($pid, 0);
+        chmod(0777 &~ umask, $executable);
+        utime $time, $time, $executable;
+    } else {    # child: unpack, and never fall through to the exec below
+        chdir($cache_dir) or die "Unable to change to $cache_dir: $!";
+        if ($platform eq 'win') {
+            exec('unzip', $archive) or die "Unable to run unzip: $!";
+        }
+        system('gunzip', '-n', $archive);
+        rename("$ARGV[0].$platform", $ARGV[0]) or die "Unable to rename: $!";
+        exit 0;
+    }
+}
+
+shift;
+exec($executable, @ARGV) or die "Unable to run $executable: $!";
+
+# Map uname's OS and CPU to NCBI's by_platform directory name; dies when NCBI
+# offers no prebuilt binaries for the combination.
+sub DetectPlatform
+{
+    my ($OS, $CPU) = (uname())[0, 4];
+    my ($pf, $last_built);
+    if ($OS =~ /^CYGWIN/) {
+        $pf = 'win';
+    } elsif ($OS eq 'Darwin') {
+        $pf = 'mac';
+    } elsif ($OS eq 'Linux') {
+        if ($CPU =~ /i\d86/) {
+            $pf = 'linux32';
+            $last_built = 'November 2014';
+        } elsif ($CPU eq 'x86_64') {
+            $pf = 'linux64';
+        }
+    } elsif ($OS eq 'SunOS') {
+        if ($CPU =~ /^s/) {
+            $pf = 'solaris';
+            $last_built = 'March 2014';
+        } else {
+            $pf = 'solaris-x86';
+            $last_built = 'September 2014';
+        }
+    }
+    # Checked after the chain so that a known OS on an unsupported CPU (Linux
+    # on anything but x86) also fails here instead of returning undef.
+    defined $pf or die "No prebuilt binaries available for $OS/$CPU";
+
+    if (defined $last_built) {
+        warn "NCBI no longer builds for $OS/$CPU; using a binary from"
+            . " $last_built";
+    }
+    return $pf;
+}
diff --git a/xtract.go b/xtract.go
index ef021ee..8ae50c6 100644
--- a/xtract.go
+++ b/xtract.go
@@ -51,6 +51,7 @@ import (
"io"
"math"
"os"
+ "path"
"runtime"
"runtime/debug"
"runtime/pprof"
@@ -63,7 +64,7 @@ import (
// VERSION AND HELP MESSAGE TEXT
-const xtractVersion = "6.00"
+const xtractVersion = "6.10"
const xtractHelp = `
Overview
@@ -188,6 +189,7 @@ Phrase Processing
-words Split at punctuation marks
-pairs Adjacent informative words
-phrase Experimental index generation
+ -letters Separate individual letters
Sequence Coordinates
@@ -215,7 +217,7 @@ Miscellaneous
Reformatting
- -format [compact|indent|expand]
+ -format [compact|flush|indent|expand]
Modification
@@ -314,11 +316,16 @@ Debugging
-debug Display run-time parameter summary
-empty Flag records with no output
- -index Print record index numbers
+ -ident Print record index numbers
-stats Show processing time for each record
-timer Report processing duration and rate
-trial Optimize -proc value, requires -input
+Record Indexing
+
+ -index Name of element to use for indexing
+ -local Base path for individual XML files
+
Internal Component Performance
-chunk ReadBlocks
@@ -692,6 +699,37 @@ Genome Range
AKAP17A A-kinase anchoring protein 17A
ASMT acetylserotonin O-methyltransferase
+3'UTR Sequences
+
+ ThreePrimeUTRs() {
+ xtract -pattern INSDSeq -ACC INSDSeq_accession-version -SEQ INSDSeq_sequence \
+ -block INSDFeature -if INSDFeature_key -equals CDS \
+ -pfc "\n" -element "&ACC" -rst -last INSDInterval_to -element "&SEQ" |
+ while read acc pos seq
+ do
+ if [ $pos -lt ${#seq} ]
+ then
+ echo -e ">$acc 3'UTR: $((pos+1))..${#seq}"
+ echo "${seq:$pos}" | fold -w 50
+ elif [ $pos -ge ${#seq} ]
+ then
+ echo -e ">$acc NO 3'UTR"
+ fi
+ done
+ }
+
+ esearch -db nuccore -query "5.5.1.19 [ECNO]" |
+ efilter -molecule mrna -source refseq |
+ efetch -format gbc | ThreePrimeUTRs
+
+ >NM_001328461.1 3'UTR: 1737..1871
+ gatgaatatagagttactgtgttgtaagctaatcatcatactgatgcaag
+ tgcattatcacatttacttctgctgatgattgttcataagattatgagtt
+ agccatttatcaaaaaaaaaaaaaaaaaaaaaaaa
+ >NM_001316759.1 3'UTR: 1628..1690
+ atccgagtaattcggaatcttgtccaattttatatagcctatattaatac
+ ...
+
Amino Acid Substitutions
ApplySNPs() {
@@ -849,6 +887,19 @@ Phrase Searching
5-hydroxyindoleacetic acid
5-hydroxytryptophan
...
+
+Mammalian Sequence Download
+
+ ftp-ls ftp.ncbi.nlm.nih.gov ncbi-asn1 |
+ grep -e gbmam -e gbpri -e gbrod |
+ xargs ftp-cp ftp.ncbi.nlm.nih.gov ncbi-asn1
+
+Human Subset Extraction
+
+ for fl in gbpri?.aso.gz gbpri??.aso.gz
+ do
+ run-ncbi-converter asn2all -i "$fl" -a t -b -c -O 9606 -f s > ${fl%.aso.gz}.xml
+ done
`
const pubMedArtSample = `
@@ -1506,6 +1557,7 @@ const (
WORDS
PAIRS
PHRASE
+ LETTERS
PFX
SFX
SEP
@@ -1647,6 +1699,7 @@ var argTypeIs = map[string]ArgumentType{
"-words": EXTRACTION,
"-pairs": EXTRACTION,
"-phrase": EXTRACTION,
+ "-letters": EXTRACTION,
"-num": EXTRACTION,
"-len": EXTRACTION,
"-sum": EXTRACTION,
@@ -1691,6 +1744,7 @@ var opTypeIs = map[string]OpType{
"-words": WORDS,
"-pairs": PAIRS,
"-phrase": PHRASE,
+ "-letters": LETTERS,
"-pfx": PFX,
"-sfx": SFX,
"-sep": SEP,
@@ -1936,6 +1990,8 @@ type Tables struct {
InElement [256]bool
ChanDepth int
FarmSize int
+ Hd string
+ Tl string
}
type Node struct {
@@ -2669,7 +2725,7 @@ func ParseArguments(args []string, pttrn string) *Block {
op := &Operation{Type: status, Value: ""}
comm = append(comm, op)
status = UNSET
- case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE:
+ case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, LETTERS:
case NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
case TAB, RET, PFX, SFX, SEP, LBL, PFC, DEF:
case UNSET:
@@ -2820,7 +2876,8 @@ func ParseArguments(args []string, pttrn string) *Block {
switch status {
case UNSET:
status = nextStatus(str)
- case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
+ case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, LETTERS,
+ NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
for !strings.HasPrefix(str, "-") {
// create one operation per argument, even if under a single -element statement
op := &Operation{Type: status, Value: str}
@@ -2997,6 +3054,8 @@ type XMLReader struct {
Reader io.Reader
Buffer []byte
Remainder string
+ Position int64
+ Delta int
Closed bool
Docompress bool
Docleanup bool
@@ -3053,6 +3112,10 @@ func (rdr *XMLReader) NextBlock() string {
return "", false, true
}
+ // keep track of file offset
+ rdr.Position += int64(rdr.Delta)
+ rdr.Delta = n
+
// slice of actual characters read
bufr := rdr.Buffer[:n+m]
@@ -3125,7 +3188,7 @@ func (rdr *XMLReader) NextBlock() string {
// PARSE XML BLOCK STREAM INTO STRINGS FROM <PATTERN> TO </PATTERN>
// PartitionPattern splits XML input by pattern and sends individual records to a callback
-func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string)) {
+func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, int64, string)) {
if pat == "" || rdr == nil || proc == nil {
return
@@ -3275,6 +3338,8 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
pos := 0
next := 0
+ offset := int64(0)
+
rec := 0
scr := newScanner(pat)
@@ -3298,6 +3363,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
if level == 0 {
inPattern = true
begin = pos
+ offset = rdr.Position + int64(pos)
}
level++
} else if match == STOPPATTERN {
@@ -3309,7 +3375,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
str := accumulator.String()
if str != "" {
rec++
- proc(rec, str[:])
+ proc(rec, offset, str[:])
}
// reset accumulator
accumulator.Reset()
@@ -3343,6 +3409,8 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
pos := 0
next := 0
+ offset := int64(0)
+
rec := 0
scr := newScanner(pat)
@@ -3467,6 +3535,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
if level == 0 {
inPattern = true
begin = pos
+ offset = rdr.Position + int64(pos)
}
level++
} else if match == STOPPATTERN {
@@ -3478,7 +3547,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
str := accumulator.String()
if str != "" {
rec++
- proc(rec, str[:])
+ proc(rec, offset, str[:])
}
// reset accumulator
accumulator.Reset()
@@ -4357,6 +4426,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
args = args[1:]
compRecrd := false
+ flushLeft := false
wrapAttrs := false
ret := "\n"
frst := true
@@ -4367,6 +4437,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
// compress to one record per line
compRecrd = true
ret = ""
+ case "flush", "flushed", "left":
+ // suppress line indentation
+ flushLeft = true
case "expand", "expanded", "verbose", "@":
// each attribute on its own line
wrapAttrs = true
@@ -4417,7 +4490,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
// function to indent a specified number of spaces
doIndent := func(indt int) {
- if compRecrd {
+ if compRecrd || flushLeft {
return
}
i := indt
@@ -4698,7 +4771,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
pat := args[1]
PartitionPattern(pat, "", in,
- func(rec int, str string) {
+ func(rec int, ofs int64, str string) {
recordCount++
byteCount += len(str)
})
@@ -4728,7 +4801,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
sendPatterns := func(pat string, out chan<- string) {
defer close(out)
PartitionPattern(pat, "", in,
- func(rec int, str string) {
+ func(rec int, ofs int64, str string) {
out <- str
})
}
@@ -5706,7 +5779,7 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
}
switch stat {
- case ELEMENT, TERMS, WORDS, PAIRS, PHRASE, VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV:
+ case ELEMENT, TERMS, WORDS, PAIRS, PHRASE, LETTERS, VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV:
exploreElements(func(str string, lvl int) {
if str != "" {
acc(str)
@@ -6027,6 +6100,17 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
buffer.WriteString("</Phrase>")
}
})
+ case LETTERS:
+ processElement(func(str string) {
+ if str != "" {
+ for _, ch := range str {
+ ok = true
+ buffer.WriteString(between)
+ buffer.WriteRune(ch)
+ between = sep
+ }
+ }
+ })
case LEN:
length := 0
@@ -6217,7 +6301,8 @@ func ProcessInstructions(commands []*Operation, curr *Node, mask, tab, ret strin
str := op.Value
switch op.Type {
- case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
+ case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, LETTERS,
+ NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
txt, ok := ProcessClause(curr, op.Stages, mask, tab, pfx, sfx, sep, def, op.Type, index, level, variables)
if ok {
tab = col
@@ -7045,7 +7130,7 @@ func ParseXML(Text, parent string, tbls *Tables) (*Node, bool) {
}
// ProcessQuery calls XML combined tokenizer parser on a partitioned string
-func ProcessQuery(text, parent, hd, tl string, index int, cmds *Block, tbls *Tables) string {
+func ProcessQuery(text, parent string, index int, cmds *Block, tbls *Tables) string {
if text == "" || cmds == nil || tbls == nil {
return ""
@@ -7065,8 +7150,8 @@ func ProcessQuery(text, parent, hd, tl string, index int, cmds *Block, tbls *Tab
ok = false
- if hd != "" {
- buffer.WriteString(hd[:])
+ if tbls.Hd != "" {
+ buffer.WriteString(tbls.Hd[:])
}
// start processing at top of command tree and top of XML subregion selected by -pattern
@@ -7078,8 +7163,8 @@ func ProcessQuery(text, parent, hd, tl string, index int, cmds *Block, tbls *Tab
}
})
- if tl != "" {
- buffer.WriteString(tl[:])
+ if tbls.Tl != "" {
+ buffer.WriteString(tbls.Tl[:])
}
if ret != "" {
@@ -7144,7 +7229,7 @@ func XMLProducer(pat, star string, rdr *XMLReader, out chan<- Extract) {
// partition all input by pattern and send XML substring to available consumer through channel
PartitionPattern(pat, star, rdr,
- func(rec int, str string) {
+ func(rec int, ofs int64, str string) {
out <- Extract{rec, str}
})
}
@@ -7168,7 +7253,7 @@ func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extra
}
// XMLConsumer reads partitioned XML from channel and calls parser for processing
-func XMLConsumer(cmds *Block, tbls *Tables, parent, hd, tl string, wg *sync.WaitGroup, inp <-chan Extract, out chan<- Extract) {
+func XMLConsumer(cmds *Block, tbls *Tables, parent string, wg *sync.WaitGroup, inp <-chan Extract, out chan<- Extract) {
// report when this consumer has no more records to process
defer wg.Done()
@@ -7185,14 +7270,14 @@ func XMLConsumer(cmds *Block, tbls *Tables, parent, hd, tl string, wg *sync.Wait
continue
}
- str := ProcessQuery(text[:], parent, hd, tl, idx, cmds, tbls)
+ str := ProcessQuery(text[:], parent, idx, cmds, tbls)
// send even if empty to get all record counts for reordering
out <- Extract{idx, str}
}
}
-func CreateConsumers(cmds *Block, tbls *Tables, parent, hd, tl string, numServers int, inp <-chan Extract) <-chan Extract {
+func CreateConsumers(cmds *Block, tbls *Tables, parent string, numServers int, inp <-chan Extract) <-chan Extract {
if tbls == nil {
return nil
@@ -7209,7 +7294,7 @@ func CreateConsumers(cmds *Block, tbls *Tables, parent, hd, tl string, numServer
// launch multiple consumer goroutines
for i := 0; i < numServers; i++ {
wg.Add(1)
- go XMLConsumer(cmds, tbls, parent, hd, tl, &wg, inp, out)
+ go XMLConsumer(cmds, tbls, parent, &wg, inp, out)
}
// launch separate anonymous goroutine to wait until all consumers are done, then close single output channel, so unshuffler can range over channel
@@ -7275,13 +7360,19 @@ func main() {
// debugging
dbug := false
mpty := false
- indx := false
+ idnt := false
stts := false
timr := false
// profiling
prfl := false
+ // element to use as local data index
+ indx := ""
+
+ // path for local data indexed as trie
+ local := ""
+
// alternative source of sample record, processed a designated number of times, looping for each -proc from 1 to nCPU (undocumented)
testCount := 0
testType := ""
@@ -7356,6 +7447,24 @@ func main() {
fileName = args[1]
// skip past first of two arguments
args = args[1:]
+ // data element for indexing
+ case "-index":
+ if len(args) < 2 {
+ fmt.Fprintf(os.Stderr, "\nERROR: Index element is missing\n")
+ os.Exit(1)
+ }
+ indx = args[1]
+ // skip past first of two arguments
+ args = args[1:]
+ // base path for local XML files
+ case "-local":
+ if len(args) < 2 {
+ fmt.Fprintf(os.Stderr, "\nERROR: Data path is missing\n")
+ os.Exit(1)
+ }
+ local = args[1]
+ // skip past first of two arguments
+ args = args[1:]
// data cleanup flags
case "-compress":
doCompress = true
@@ -7366,8 +7475,8 @@ func main() {
dbug = true
case "-empty":
mpty = true
- case "-index":
- indx = true
+ case "-ident":
+ idnt = true
case "-stats", "-stat":
stts = true
case "-timer":
@@ -7982,6 +8091,10 @@ func main() {
}
}
+ // per-record head and tail passed in master table
+ tbls.Hd = hd
+ tbls.Tl = tl
+
// ENSURE PRESENCE OF PATTERN ARGUMENT
if len(args) < 1 {
@@ -8023,6 +8136,134 @@ func main() {
os.Exit(1)
}
+ // SAVE XML COMPONENT RECORDS TO LOCAL DIRECTORY INDEXED BY TRIE ON IDENTIFIER
+
+ // -local plus -index saves XML files in trie-based directory structure (experimental)
+ if local != "" && indx != "" {
+
+ prnt, match := SplitInTwoAt(indx, "/", RIGHT)
+ match, attrib := SplitInTwoAt(match, "@", LEFT)
+
+ var idbuf bytes.Buffer
+
+ makeTrie := func(str string) string {
+ idbuf.Reset()
+ between := ""
+ for _, ch := range str {
+ if ch == '.' {
+ break
+ }
+ idbuf.WriteString(between)
+ idbuf.WriteRune(ch)
+ between = "/"
+ }
+ trie := idbuf.String()
+ return trie
+ }
+
+ PartitionPattern(topPattern, star, rdr,
+ func(rec int, ofs int64, str string) {
+ pat, ok := ParseXML(str[:], parent, tbls)
+ if !ok {
+ return
+ }
+ trie := ""
+ file := ""
+ num := 0
+ ExploreElements(pat, "", prnt, match, attrib, false, 1,
+ func(stn string, lvl int) {
+ num++
+ if stn != "" {
+ trie = makeTrie(stn)
+ file = stn
+ }
+ })
+ if trie == "" || file == "" || num != 1 {
+ return
+ }
+ dpath := path.Join(local, trie)
+ if dpath == "" {
+ return
+ }
+ _, err := os.Stat(dpath)
+ if err != nil && os.IsNotExist(err) {
+ err = os.MkdirAll(dpath, os.ModePerm)
+ }
+ if err != nil {
+ fmt.Println(err.Error())
+ return
+ }
+ fpath := path.Join(dpath, file+".xml")
+ if fpath == "" {
+ return
+ }
+ _, err = os.Stat(fpath)
+ if err != nil && os.IsNotExist(err) {
+ fl, err := os.Create(fpath)
+ if err != nil {
+ fmt.Println(err.Error())
+ return
+ }
+ fl.Close()
+ }
+ fl, err := os.OpenFile(fpath, os.O_RDWR, 0644)
+ if err != nil {
+ fmt.Println(err.Error())
+ return
+ }
+ fl.WriteString(str)
+ fl.WriteString("\n")
+ err = fl.Sync()
+ if err != nil {
+ fmt.Println(err.Error())
+ }
+ fl.Close()
+ fmt.Printf("%d\t%s\n", rec, fpath)
+ })
+
+ return
+ }
+
+ // GENERATE RECORD INDEX ON XML INPUT FILE
+
+ // -index command prints record identifier, file offset, and XML size
+ if indx != "" {
+
+ prnt, match := SplitInTwoAt(indx, "/", RIGHT)
+ match, attrib := SplitInTwoAt(match, "@", LEFT)
+
+ // legend := "REC\tID\tOFST\tSIZE"
+
+ PartitionPattern(topPattern, star, rdr,
+ func(rec int, ofs int64, str string) {
+ pat, ok := ParseXML(str[:], parent, tbls)
+ if !ok {
+ return
+ }
+ id := ""
+ num := 0
+ ExploreElements(pat, "", prnt, match, attrib, false, 1,
+ func(stn string, lvl int) {
+ num++
+ if stn != "" {
+ id = stn
+ }
+ })
+ if id == "" || num != 1 {
+ return
+ }
+ /*
+ if legend != "" {
+ fmt.Printf("%s\n", legend)
+ legend = ""
+ }
+ */
+ fmt.Printf("%d\t%s\t%d\t%d\n", rec, id, ofs, len(str))
+ })
+
+ return
+ }
+
// PARSE AND VALIDATE EXTRACTION ARGUMENTS
// parse nested exploration instruction from command-line arguments
@@ -8037,12 +8278,12 @@ func main() {
// -stats with an extraction command prints XML size and processing time for each record
if stts {
- legend := "REC\tSIZE\tTIME"
+ legend := "REC\tOFST\tSIZE\tTIME"
PartitionPattern(topPattern, star, rdr,
- func(rec int, str string) {
+ func(rec int, ofs int64, str string) {
beginTime := time.Now()
- ProcessQuery(str[:], parent, "", "", rec, cmds, tbls)
+ ProcessQuery(str[:], parent, rec, cmds, tbls)
endTime := time.Now()
duration := endTime.Sub(beginTime)
micro := int(float64(duration.Nanoseconds()) / 1e3)
@@ -8050,7 +8291,7 @@ func main() {
fmt.Printf("%s\n", legend)
legend = ""
}
- fmt.Printf("%d\t%d\t%d\n", rec, len(str), micro)
+ fmt.Printf("%d\t%d\t%d\t%d\n", rec, ofs, len(str), micro)
})
return
@@ -8079,7 +8320,7 @@ func main() {
}
close(out)
}(xmlq)
- tblq := CreateConsumers(cmds, tbls, parent, "", "", numServ, xmlq)
+ tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq)
if xmlq == nil || tblq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
@@ -8143,7 +8384,7 @@ func main() {
}
xmlq := CreateProducer(topPattern, star, rdr, tbls)
- tblq := CreateConsumers(cmds, tbls, parent, hd, tl, numServ, xmlq)
+ tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq)
if xmlq == nil || tblq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
@@ -8201,7 +8442,7 @@ func main() {
if cmds.Position == "first" {
PartitionPattern(topPattern, star, rdr,
- func(rec int, str string) {
+ func(rec int, ofs int64, str string) {
if rec == 1 {
qry = str
idx = rec
@@ -8211,7 +8452,7 @@ func main() {
} else if cmds.Position == "last" {
PartitionPattern(topPattern, star, rdr,
- func(rec int, str string) {
+ func(rec int, ofs int64, str string) {
qry = str
idx = rec
})
@@ -8226,7 +8467,7 @@ func main() {
}
PartitionPattern(topPattern, star, rdr,
- func(rec int, str string) {
+ func(rec int, ofs int64, str string) {
if rec == number {
qry = str
idx = rec
@@ -8242,7 +8483,7 @@ func main() {
cmds.Position = ""
// process single selected record
- res := ProcessQuery(qry[:], parent, "", "", idx, cmds, tbls)
+ res := ProcessQuery(qry[:], parent, idx, cmds, tbls)
if res != "" {
fmt.Printf("%s\n", res)
}
@@ -8260,7 +8501,7 @@ func main() {
}
// launch consumer goroutines to parse and explore partitioned XML objects
- tblq := CreateConsumers(cmds, tbls, parent, hd, tl, numServers, xmlq)
+ tblq := CreateConsumers(cmds, tbls, parent, numServers, xmlq)
if tblq == nil {
fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumers\n")
os.Exit(1)
@@ -8319,7 +8560,7 @@ func main() {
buffer.WriteString("\n")
}
- // printResult prints output for current pattern, handles -empty and -index flags, and periodically flushes buffer
+ // printResult prints output for current pattern, handles -empty and -ident flags, and periodically flushes buffer
printResult := func(curr Extract) {
str := curr.Text
@@ -8342,7 +8583,7 @@ func main() {
okay = true
- if indx {
+ if idnt {
idx := curr.Index
val := strconv.Itoa(idx)
buffer.WriteString(val[:])
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ncbi-entrez-direct.git
More information about the debian-med-commit
mailing list