[med-svn] [ncbi-entrez-direct] 03/08: New upstream version 6.10.20170123+ds

Aaron M. Ucko <ucko@moszumanska.debian.org>
Wed Jan 25 03:00:00 UTC 2017


This is an automated email from the git hooks/post-receive script.

ucko pushed a commit to branch master
in repository ncbi-entrez-direct.

commit b58755d334de654f2386b839ceef9e9954317b86
Author: Aaron M. Ucko <ucko@debian.org>
Date:   Tue Jan 24 21:15:47 2017 -0500

    New upstream version 6.10.20170123+ds
---
 edirect.pl         |  99 ++---------------
 run-ncbi-converter |  95 ++++++++++++++++
 xtract.go          | 319 ++++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 387 insertions(+), 126 deletions(-)

diff --git a/edirect.pl b/edirect.pl
index 9bf7d77..3ab53f4 100755
--- a/edirect.pl
+++ b/edirect.pl
@@ -87,7 +87,7 @@ use constant true  => 1;
 
 # EDirect version number
 
-$version = "6.00";
+$version = "6.10";
 
 # URL address components
 
@@ -154,7 +154,6 @@ sub clearflags {
   $mxdate = "";
   $name = "";
   $neighbor = false;
-  $nogi = false;
   $num = "";
   $organism = "";
   $output = "";
@@ -1633,45 +1632,6 @@ sub fix_bad_encoding {
   return $data;
 }
 
-sub accn_to_gi {
-
-  my $dbsx = shift (@_);
-  my $accn = shift (@_);
-  my $nogi = shift (@_);
-
-  my $id = 0;
-
-  if ( $dbsx eq "" or $accn eq "" ) {
-    return 0;
-  }
-
-  if ( $nogi ) {
-    return $accn;
-  }
-
-  if ( $dbsx ne "nucleotide" and
-       $dbsx ne "nuccore" and
-       $dbsx ne "est" and
-       $dbsx ne "gss" and
-       $dbsx ne "protein" ) {
-    print STDERR "\nFor -db $dbsx, the -id argument must be numeric\n\n";
-    return 0;
-  }
-
-  $url = $base . $esearch;
-  $url .= "?db=$dbsx&&term=$accn%5bACCN%5d";
-
-  $output = get ($url);
-
-  if ( $output eq "" ) {
-    print STDERR "No get_count output returned from '$url'\n";
-  }
-
-  $id = $1 if ($output =~ /<Id>(\S+)<\/Id>/);
-
-  return $id;
-}
-
 sub esmry {
 
   my $dbase = shift (@_);
@@ -1691,7 +1651,6 @@ sub esmry {
   my $http = shift (@_);
   my $alias = shift (@_);
   my $basx = shift (@_);
-  my $nogi = shift (@_);
 
   $dbase = lc($dbase);
 
@@ -1711,20 +1670,6 @@ sub esmry {
       }
     }
 
-    if ($id !~ /^\d+$/ and $id !~ /,/) {
-
-      # convert single accession to GI number
-
-      $id = accn_to_gi ($dbase, $id, $nogi);
-
-      if ( $id eq "0" ) {
-
-        # id "0" is an unrecognized accession
-
-        return;
-      }
-    }
-
     $url = $base . $esummary;
 
     $arg = "db=$dbase&id=$id";
@@ -2055,7 +2000,6 @@ sub eftch {
     "extrafeat=i" => \$extrafeat,
     "start=i" => \$min,
     "stop=i" => \$max,
-    "nogi" => \$nogi,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "pipe" => \$pipe,
@@ -2149,7 +2093,7 @@ sub eftch {
   if ( $type eq "docsum" or $fnc eq "-summary" ) {
 
     esmry ( $dbase, $web, $key, $num, $id, $mode, $min, $max, $tool, $email,
-            $silent, $verbose, $debug, $log, $http, $alias, $basx, $nogi );
+            $silent, $verbose, $debug, $log, $http, $alias, $basx );
 
     return;
   }
@@ -2165,20 +2109,6 @@ sub eftch {
 
     if ( $id ne "" ) {
 
-      if ($id !~ /^\d+$/ and $id !~ /,/) {
-
-        # convert single accession to GI number
-
-        $id = accn_to_gi ($dbase, $id, $nogi);
-
-        if ( $id eq "0" ) {
-
-          # id "0" is an unrecognized accession
-
-          return;
-        }
-      }
-
       my @ids = split (',', $id);
       foreach $uid (@ids) {
         print "$uid\n";
@@ -3090,6 +3020,7 @@ sub elink {
     $link_help,
     "db=s" => \$db,
     "id=s" => \$id,
+    "format=s" => \$type,
     "target=s" => \$dbto,
     "name=s" => \$name,
     "related" => \$related,
@@ -3099,7 +3030,6 @@ sub elink {
     "batch" => \$batch,
     "holding=s" => \$holding,
     "label=s" => \$lbl,
-    "nogi" => \$nogi,
     "email=s" => \$emaddr,
     "tool=s" => \$tuul,
     "help" => \$help,
@@ -3191,20 +3121,6 @@ sub elink {
 
   if ( $dbase ne "" and $id ne "" ) {
 
-    if ($id !~ /^\d+$/ and $id !~ /,/) {
-
-      # convert single accession to GI number
-
-      $id = accn_to_gi ($dbase, $id, $nogi);
-
-      if ( $id eq "0" ) {
-
-        # id "0" is an unrecognized accession
-
-        return;
-      }
-    }
-
     # process db and id command-line arguments instead of getting from history
 
     $url = $base . $elink;
@@ -3218,6 +3134,9 @@ sub elink {
       $arg .= "&retmode=$mode";
     }
     $arg .= "&id=$id";
+    if ( $type eq "acc" ) {
+      $arg .= "&idtype=acc";
+    }
 
     $data = do_post ($url, $arg, $tool, $email, true);
 
@@ -3276,6 +3195,9 @@ sub elink {
       }
       $arg .= "&id=";
       $arg .= join ('&id=', @ids);
+      if ( $type eq "acc" ) {
+        $arg .= "&idtype=acc";
+      }
 
       $data = do_post ($url, $arg, $tool, $email, true);
 
@@ -3306,6 +3228,9 @@ sub elink {
   if ( $mode ne "" ) {
     $arg .= "&retmode=$mode";
   }
+  if ( $type eq "acc" ) {
+    $arg .= "&idtype=acc";
+  }
 
   $wb = $web;
   $ky = $key;
diff --git a/run-ncbi-converter b/run-ncbi-converter
new file mode 100755
index 0000000..d54f040
--- /dev/null
+++ b/run-ncbi-converter
@@ -0,0 +1,95 @@
+#!/usr/bin/perl -w
+use strict;
+
+use File::Path;
+use Net::FTP;
+use POSIX qw(uname);
+
+my $cache_dir = "$ENV{HOME}/.cache/ncbi-converters";
+if (defined $ENV{NCBI_CONVERTER_DIR}) {
+    $cache_dir = $ENV{NCBI_CONVERTER_DIR};
+}
+
+my $server   = 'ftp.ncbi.nlm.nih.gov';
+my $platform = DetectPlatform();
+my $dir      = "/toolbox/ncbi_tools/converters/by_platform/$platform";
+my $ext      = ($platform eq 'win') ? 'zip' : 'gz';
+my $binext   = ($platform eq 'win') ? '.exe' : '';
+my $archive  = "$ARGV[0].$platform.$ext";
+my $executable = "$cache_dir/" . $ARGV[0] . $binext;
+
+if ( ! -d $cache_dir ) {
+    File::Path::make_path($cache_dir)
+        or die "Unable to ensure the existence of $cache_dir: $!";
+}
+
+my $ftp = new Net::FTP($server, Passive => 1)
+    or die "Unable to connect to FTP server: $!";
+$ftp->login or die "Unable to log in to FTP server";
+$ftp->cwd($dir) or die "Unable to change to $dir";
+$ftp->binary or warn "Unable to set binary mode";
+
+my $time = $ftp->mdtm($archive);
+my @stats = stat "$cache_dir/$ARGV[0]$binext";
+if ( !@stats  ||  $stats[9] < $time) {
+    $ftp->get("$archive", "$cache_dir/$archive")
+        or die "Unable to download $archive";
+    utime $time, $time, "$cache_dir/$archive";
+    my $pid = fork();
+    if ($pid < 0) {
+        die "Unable to fork for unpacking: $!";
+    } elsif ($pid > 0) {
+        waitpid($pid, 0);
+        chmod(0777 &~ umask, $executable);
+        utime $time, $time, $executable;
+    } else {
+        chdir($cache_dir);
+        if ($platform eq 'win') {
+            exec('unzip', $archive);
+        } else {
+            system('gunzip', '-n', $archive);
+            rename("$ARGV[0].$platform", $ARGV[0]);
+            exit 0;
+        }
+    }
+}
+
+shift;
+exec($executable, @ARGV);
+
+sub DetectPlatform
+{
+    my @uname = uname();
+    my $OS    = $uname[0];
+    my $CPU   = $uname[4];
+    my $pf;
+    my $last_built;
+    if ($OS =~ /^CYGWIN/) {
+        $pf = 'win';
+    } elsif ($OS eq 'Darwin') {
+        $pf = 'mac';
+    } elsif ($OS eq 'Linux') {
+        if ($CPU =~ /i\d86/) {
+            $pf = 'linux32';
+            $last_built = 'November 2014';
+        } elsif ($CPU eq 'x86_64') {
+            $pf = 'linux64';
+        }
+    } elsif ($OS eq 'SunOS') {
+        if ($CPU =~ /^s/) {
+            $pf = 'solaris';
+            $last_built = 'March 2014';
+        } else {
+            $pf = 'solaris-x86';
+            $last_built = 'September 2014';
+        }
+    } else {
+        die "No prebuilt binaries available for $OS/$CPU";
+    }
+
+    if (defined $last_built) {
+        warn "NCBI no longer builds for $OS/$CPU; using a binary from"
+            . " $last_built";
+    }
+    return $pf;
+}
diff --git a/xtract.go b/xtract.go
index ef021ee..8ae50c6 100644
--- a/xtract.go
+++ b/xtract.go
@@ -51,6 +51,7 @@ import (
 	"io"
 	"math"
 	"os"
+	"path"
 	"runtime"
 	"runtime/debug"
 	"runtime/pprof"
@@ -63,7 +64,7 @@ import (
 
 // VERSION AND HELP MESSAGE TEXT
 
-const xtractVersion = "6.00"
+const xtractVersion = "6.10"
 
 const xtractHelp = `
 Overview
@@ -188,6 +189,7 @@ Phrase Processing
   -words           Split at punctuation marks
   -pairs           Adjacent informative words
   -phrase          Experimental index generation
+  -letters         Separate individual letters
 
 Sequence Coordinates
 
@@ -215,7 +217,7 @@ Miscellaneous
 
 Reformatting
 
-  -format          [compact|indent|expand]
+  -format          [compact|flush|indent|expand]
 
 Modification
 
@@ -314,11 +316,16 @@ Debugging
 
   -debug    Display run-time parameter summary
   -empty    Flag records with no output
-  -index    Print record index numbers
+  -ident    Print record index numbers
   -stats    Show processing time for each record
   -timer    Report processing duration and rate
   -trial    Optimize -proc value, requires -input
 
+Record Indexing
+
+  -index    Name of element to use for indexing
+  -local    Base path for individual XML files
+
 Internal Component Performance
 
   -chunk    ReadBlocks
@@ -692,6 +699,37 @@ Genome Range
   AKAP17A      A-kinase anchoring protein 17A
   ASMT         acetylserotonin O-methyltransferase
 
+3'UTR Sequences
+
+  ThreePrimeUTRs() {
+    xtract -pattern INSDSeq -ACC INSDSeq_accession-version -SEQ INSDSeq_sequence \
+      -block INSDFeature -if INSDFeature_key -equals CDS \
+        -pfc "\n" -element "&ACC" -rst -last INSDInterval_to -element "&SEQ" |
+    while read acc pos seq
+    do
+      if [ $pos -lt ${#seq} ]
+      then
+        echo -e ">$acc 3'UTR: $((pos+1))..${#seq}"
+        echo "${seq:$pos}" | fold -w 50
+      elif [ $pos -ge ${#seq} ]
+      then
+        echo -e ">$acc NO 3'UTR"
+      fi
+    done
+  }
+
+  esearch -db nuccore -query "5.5.1.19 [ECNO]" |
+  efilter -molecule mrna -source refseq |
+  efetch -format gbc | ThreePrimeUTRs
+
+  >NM_001328461.1 3'UTR: 1737..1871
+  gatgaatatagagttactgtgttgtaagctaatcatcatactgatgcaag
+  tgcattatcacatttacttctgctgatgattgttcataagattatgagtt
+  agccatttatcaaaaaaaaaaaaaaaaaaaaaaaa
+  >NM_001316759.1 3'UTR: 1628..1690
+  atccgagtaattcggaatcttgtccaattttatatagcctatattaatac
+  ...
+
 Amino Acid Substitutions
 
   ApplySNPs() {
@@ -849,6 +887,19 @@ Phrase Searching
     5-hydroxyindoleacetic acid
     5-hydroxytryptophan
     ...
+
+Mammalian Sequence Download
+
+  ftp-ls ftp.ncbi.nlm.nih.gov ncbi-asn1 |
+  grep -e gbmam -e gbpri -e gbrod |
+  xargs ftp-cp ftp.ncbi.nlm.nih.gov ncbi-asn1
+
+Human Subset Extraction
+
+  for fl in gbpri?.aso.gz gbpri??.aso.gz
+  do
+    run-ncbi-converter asn2all -i "$fl" -a t -b -c -O 9606 -f s > ${fl%.aso.gz}.xml
+  done
 `
 
 const pubMedArtSample = `
@@ -1506,6 +1557,7 @@ const (
 	WORDS
 	PAIRS
 	PHRASE
+	LETTERS
 	PFX
 	SFX
 	SEP
@@ -1647,6 +1699,7 @@ var argTypeIs = map[string]ArgumentType{
 	"-words":       EXTRACTION,
 	"-pairs":       EXTRACTION,
 	"-phrase":      EXTRACTION,
+	"-letters":     EXTRACTION,
 	"-num":         EXTRACTION,
 	"-len":         EXTRACTION,
 	"-sum":         EXTRACTION,
@@ -1691,6 +1744,7 @@ var opTypeIs = map[string]OpType{
 	"-words":       WORDS,
 	"-pairs":       PAIRS,
 	"-phrase":      PHRASE,
+	"-letters":     LETTERS,
 	"-pfx":         PFX,
 	"-sfx":         SFX,
 	"-sep":         SEP,
@@ -1936,6 +1990,8 @@ type Tables struct {
 	InElement [256]bool
 	ChanDepth int
 	FarmSize  int
+	Hd        string
+	Tl        string
 }
 
 type Node struct {
@@ -2669,7 +2725,7 @@ func ParseArguments(args []string, pttrn string) *Block {
 				op := &Operation{Type: status, Value: ""}
 				comm = append(comm, op)
 				status = UNSET
-			case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE:
+			case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, LETTERS:
 			case NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
 			case TAB, RET, PFX, SFX, SEP, LBL, PFC, DEF:
 			case UNSET:
@@ -2820,7 +2876,8 @@ func ParseArguments(args []string, pttrn string) *Block {
 			switch status {
 			case UNSET:
 				status = nextStatus(str)
-			case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
+			case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, LETTERS,
+				NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
 				for !strings.HasPrefix(str, "-") {
 					// create one operation per argument, even if under a single -element statement
 					op := &Operation{Type: status, Value: str}
@@ -2997,6 +3054,8 @@ type XMLReader struct {
 	Reader     io.Reader
 	Buffer     []byte
 	Remainder  string
+	Position   int64
+	Delta      int
 	Closed     bool
 	Docompress bool
 	Docleanup  bool
@@ -3053,6 +3112,10 @@ func (rdr *XMLReader) NextBlock() string {
 			return "", false, true
 		}
 
+		// keep track of file offset
+		rdr.Position += int64(rdr.Delta)
+		rdr.Delta = n
+
 		// slice of actual characters read
 		bufr := rdr.Buffer[:n+m]
 
@@ -3125,7 +3188,7 @@ func (rdr *XMLReader) NextBlock() string {
 // PARSE XML BLOCK STREAM INTO STRINGS FROM <PATTERN> TO </PATTERN>
 
 // PartitionPattern splits XML input by pattern and sends individual records to a callback
-func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string)) {
+func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, int64, string)) {
 
 	if pat == "" || rdr == nil || proc == nil {
 		return
@@ -3275,6 +3338,8 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
 		pos := 0
 		next := 0
 
+		offset := int64(0)
+
 		rec := 0
 
 		scr := newScanner(pat)
@@ -3298,6 +3363,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
 					if level == 0 {
 						inPattern = true
 						begin = pos
+						offset = rdr.Position + int64(pos)
 					}
 					level++
 				} else if match == STOPPATTERN {
@@ -3309,7 +3375,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
 						str := accumulator.String()
 						if str != "" {
 							rec++
-							proc(rec, str[:])
+							proc(rec, offset, str[:])
 						}
 						// reset accumulator
 						accumulator.Reset()
@@ -3343,6 +3409,8 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
 		pos := 0
 		next := 0
 
+		offset := int64(0)
+
 		rec := 0
 
 		scr := newScanner(pat)
@@ -3467,6 +3535,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
 					if level == 0 {
 						inPattern = true
 						begin = pos
+						offset = rdr.Position + int64(pos)
 					}
 					level++
 				} else if match == STOPPATTERN {
@@ -3478,7 +3547,7 @@ func PartitionPattern(pat, star string, rdr *XMLReader, proc func(int, string))
 						str := accumulator.String()
 						if str != "" {
 							rec++
-							proc(rec, str[:])
+							proc(rec, offset, str[:])
 						}
 						// reset accumulator
 						accumulator.Reset()
@@ -4357,6 +4426,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 		args = args[1:]
 
 		compRecrd := false
+		flushLeft := false
 		wrapAttrs := false
 		ret := "\n"
 		frst := true
@@ -4367,6 +4437,9 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 				// compress to one record per line
 				compRecrd = true
 				ret = ""
+			case "flush", "flushed", "left":
+				// suppress line indentation
+				flushLeft = true
 			case "expand", "expanded", "verbose", "@":
 				// each attribute on its own line
 				wrapAttrs = true
@@ -4417,7 +4490,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 
 		// function to indent a specified number of spaces
 		doIndent := func(indt int) {
-			if compRecrd {
+			if compRecrd || flushLeft {
 				return
 			}
 			i := indt
@@ -4698,7 +4771,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 		pat := args[1]
 
 		PartitionPattern(pat, "", in,
-			func(rec int, str string) {
+			func(rec int, ofs int64, str string) {
 				recordCount++
 				byteCount += len(str)
 			})
@@ -4728,7 +4801,7 @@ func ProcessXMLStream(in *XMLReader, tbls *Tables, args []string, action Special
 		sendPatterns := func(pat string, out chan<- string) {
 			defer close(out)
 			PartitionPattern(pat, "", in,
-				func(rec int, str string) {
+				func(rec int, ofs int64, str string) {
 					out <- str
 				})
 		}
@@ -5706,7 +5779,7 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
 			}
 
 			switch stat {
-			case ELEMENT, TERMS, WORDS, PAIRS, PHRASE, VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV:
+			case ELEMENT, TERMS, WORDS, PAIRS, PHRASE, LETTERS, VALUE, LEN, SUM, MIN, MAX, SUB, AVG, DEV:
 				exploreElements(func(str string, lvl int) {
 					if str != "" {
 						acc(str)
@@ -6027,6 +6100,17 @@ func ProcessClause(curr *Node, stages []*Step, mask, prev, pfx, sfx, sep, def st
 				buffer.WriteString("</Phrase>")
 			}
 		})
+	case LETTERS:
+		processElement(func(str string) {
+			if str != "" {
+				for _, ch := range str {
+					ok = true
+					buffer.WriteString(between)
+					buffer.WriteRune(ch)
+					between = sep
+				}
+			}
+		})
 	case LEN:
 		length := 0
 
@@ -6217,7 +6301,8 @@ func ProcessInstructions(commands []*Operation, curr *Node, mask, tab, ret strin
 		str := op.Value
 
 		switch op.Type {
-		case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
+		case ELEMENT, FIRST, LAST, ENCODE, UPPER, LOWER, TITLE, TERMS, WORDS, PAIRS, PHRASE, LETTERS,
+			NUM, LEN, SUM, MIN, MAX, INC, DEC, SUB, AVG, DEV, ZEROBASED, ONEBASED, UCSCBASED:
 			txt, ok := ProcessClause(curr, op.Stages, mask, tab, pfx, sfx, sep, def, op.Type, index, level, variables)
 			if ok {
 				tab = col
@@ -7045,7 +7130,7 @@ func ParseXML(Text, parent string, tbls *Tables) (*Node, bool) {
 }
 
 // ProcessQuery calls XML combined tokenizer parser on a partitioned string
-func ProcessQuery(text, parent, hd, tl string, index int, cmds *Block, tbls *Tables) string {
+func ProcessQuery(text, parent string, index int, cmds *Block, tbls *Tables) string {
 
 	if text == "" || cmds == nil || tbls == nil {
 		return ""
@@ -7065,8 +7150,8 @@ func ProcessQuery(text, parent, hd, tl string, index int, cmds *Block, tbls *Tab
 
 	ok = false
 
-	if hd != "" {
-		buffer.WriteString(hd[:])
+	if tbls.Hd != "" {
+		buffer.WriteString(tbls.Hd[:])
 	}
 
 	// start processing at top of command tree and top of XML subregion selected by -pattern
@@ -7078,8 +7163,8 @@ func ProcessQuery(text, parent, hd, tl string, index int, cmds *Block, tbls *Tab
 			}
 		})
 
-	if tl != "" {
-		buffer.WriteString(tl[:])
+	if tbls.Tl != "" {
+		buffer.WriteString(tbls.Tl[:])
 	}
 
 	if ret != "" {
@@ -7144,7 +7229,7 @@ func XMLProducer(pat, star string, rdr *XMLReader, out chan<- Extract) {
 
 	// partition all input by pattern and send XML substring to available consumer through channel
 	PartitionPattern(pat, star, rdr,
-		func(rec int, str string) {
+		func(rec int, ofs int64, str string) {
 			out <- Extract{rec, str}
 		})
 }
@@ -7168,7 +7253,7 @@ func CreateProducer(pat, star string, rdr *XMLReader, tbls *Tables) <-chan Extra
 }
 
 // XMLConsumer reads partitioned XML from channel and calls parser for processing
-func XMLConsumer(cmds *Block, tbls *Tables, parent, hd, tl string, wg *sync.WaitGroup, inp <-chan Extract, out chan<- Extract) {
+func XMLConsumer(cmds *Block, tbls *Tables, parent string, wg *sync.WaitGroup, inp <-chan Extract, out chan<- Extract) {
 
 	// report when this consumer has no more records to process
 	defer wg.Done()
@@ -7185,14 +7270,14 @@ func XMLConsumer(cmds *Block, tbls *Tables, parent, hd, tl string, wg *sync.Wait
 			continue
 		}
 
-		str := ProcessQuery(text[:], parent, hd, tl, idx, cmds, tbls)
+		str := ProcessQuery(text[:], parent, idx, cmds, tbls)
 
 		// send even if empty to get all record counts for reordering
 		out <- Extract{idx, str}
 	}
 }
 
-func CreateConsumers(cmds *Block, tbls *Tables, parent, hd, tl string, numServers int, inp <-chan Extract) <-chan Extract {
+func CreateConsumers(cmds *Block, tbls *Tables, parent string, numServers int, inp <-chan Extract) <-chan Extract {
 
 	if tbls == nil {
 		return nil
@@ -7209,7 +7294,7 @@ func CreateConsumers(cmds *Block, tbls *Tables, parent, hd, tl string, numServer
 	// launch multiple consumer goroutines
 	for i := 0; i < numServers; i++ {
 		wg.Add(1)
-		go XMLConsumer(cmds, tbls, parent, hd, tl, &wg, inp, out)
+		go XMLConsumer(cmds, tbls, parent, &wg, inp, out)
 	}
 
 	// launch separate anonymous goroutine to wait until all consumers are done, then close single output channel, so unshuffler can range over channel
@@ -7275,13 +7360,19 @@ func main() {
 	// debugging
 	dbug := false
 	mpty := false
-	indx := false
+	idnt := false
 	stts := false
 	timr := false
 
 	// profiling
 	prfl := false
 
+	// element to use as local data index
+	indx := ""
+
+	// path for local data indexed as trie
+	local := ""
+
 	// alternative source of sample record, processed a designated number of times, looping for each -proc from 1 to nCPU (undocumented)
 	testCount := 0
 	testType := ""
@@ -7356,6 +7447,24 @@ func main() {
 			fileName = args[1]
 			// skip past first of two arguments
 			args = args[1:]
+		// data element for indexing
+		case "-index":
+			if len(args) < 2 {
+				fmt.Fprintf(os.Stderr, "\nERROR: Index element is missing\n")
+				os.Exit(1)
+			}
+			indx = args[1]
+			// skip past first of two arguments
+			args = args[1:]
+		// data element for indexing
+		case "-local":
+			if len(args) < 2 {
+				fmt.Fprintf(os.Stderr, "\nERROR: Data path is missing\n")
+				os.Exit(1)
+			}
+			local = args[1]
+			// skip past first of two arguments
+			args = args[1:]
 		// data cleanup flags
 		case "-compress":
 			doCompress = true
@@ -7366,8 +7475,8 @@ func main() {
 			dbug = true
 		case "-empty":
 			mpty = true
-		case "-index":
-			indx = true
+		case "-ident":
+			idnt = true
 		case "-stats", "-stat":
 			stts = true
 		case "-timer":
@@ -7982,6 +8091,10 @@ func main() {
 		}
 	}
 
+	// per-record head and tail passed in master table
+	tbls.Hd = hd
+	tbls.Tl = tl
+
 	// ENSURE PRESENCE OF PATTERN ARGUMENT
 
 	if len(args) < 1 {
@@ -8023,6 +8136,134 @@ func main() {
 		os.Exit(1)
 	}
 
+	// SAVE XML COMPONENT RECORDS TO LOCAL DIRECTORY INDEXED BY TRIE ON IDENTIFIER
+
+	// -local plus -index saves XML files in trie-based directory structure (experimental)
+	if local != "" && indx != "" {
+
+		prnt, match := SplitInTwoAt(indx, "/", RIGHT)
+		match, attrib := SplitInTwoAt(match, "@", LEFT)
+
+		var idbuf bytes.Buffer
+
+		makeTrie := func(str string) string {
+			idbuf.Reset()
+			between := ""
+			for _, ch := range str {
+				if ch == '.' {
+					break
+				}
+				idbuf.WriteString(between)
+				idbuf.WriteRune(ch)
+				between = "/"
+			}
+			trie := idbuf.String()
+			return trie
+		}
+
+		PartitionPattern(topPattern, star, rdr,
+			func(rec int, ofs int64, str string) {
+				pat, ok := ParseXML(str[:], parent, tbls)
+				if !ok {
+					return
+				}
+				trie := ""
+				file := ""
+				num := 0
+				ExploreElements(pat, "", prnt, match, attrib, false, 1,
+					func(stn string, lvl int) {
+						num++
+						if stn != "" {
+							trie = makeTrie(stn)
+							file = stn
+						}
+					})
+				if trie == "" || file == "" || num != 1 {
+					return
+				}
+				dpath := path.Join(local, trie)
+				if dpath == "" {
+					return
+				}
+				_, err := os.Stat(dpath)
+				if err != nil && os.IsNotExist(err) {
+					err = os.MkdirAll(dpath, os.ModePerm)
+				}
+				if err != nil {
+					fmt.Println(err.Error())
+					return
+				}
+				fpath := path.Join(dpath, file+".xml")
+				if fpath == "" {
+					return
+				}
+				_, err = os.Stat(fpath)
+				if err != nil && os.IsNotExist(err) {
+					fl, err := os.Create(fpath)
+					if err != nil {
+						fmt.Println(err.Error())
+						return
+					}
+					fl.Close()
+				}
+				fl, err := os.OpenFile(fpath, os.O_RDWR, 0644)
+				if err != nil {
+					fmt.Println(err.Error())
+					return
+				}
+				fl.WriteString(str)
+				fl.WriteString("\n")
+				err = fl.Sync()
+				if err != nil {
+					fmt.Println(err.Error())
+				}
+				fl.Close()
+				fmt.Printf("%d\t%s\n", rec, fpath)
+			})
+
+		return
+	}
+
+	// GENERATE RECORD INDEX ON XML INPUT FILE
+
+	// -index command prints record identifier, file offset, and XML size
+	if indx != "" {
+
+		prnt, match := SplitInTwoAt(indx, "/", RIGHT)
+		match, attrib := SplitInTwoAt(match, "@", LEFT)
+
+		// legend := "REC\tID\tOFST\tSIZE"
+
+		PartitionPattern(topPattern, star, rdr,
+			func(rec int, ofs int64, str string) {
+				pat, ok := ParseXML(str[:], parent, tbls)
+				if !ok {
+					return
+				}
+				id := ""
+				num := 0
+				ExploreElements(pat, "", prnt, match, attrib, false, 1,
+					func(stn string, lvl int) {
+						num++
+						if stn != "" {
+							id = stn
+						}
+					})
+				if id == "" || num != 1 {
+					return
+				}
+				/*
+					if legend != "" {
+						fmt.Printf("%s\n", legend)
+						legend = ""
+					}
+				*/
+				fmt.Printf("%d\t%s\t%d\t%d\n", rec, id, ofs, len(str))
+			})
+
+		return
+	}
+
 	// PARSE AND VALIDATE EXTRACTION ARGUMENTS
 
 	// parse nested exploration instruction from command-line arguments
@@ -8037,12 +8278,12 @@ func main() {
 	// -stats with an extraction command prints XML size and processing time for each record
 	if stts {
 
-		legend := "REC\tSIZE\tTIME"
+		legend := "REC\tOFST\tSIZE\tTIME"
 
 		PartitionPattern(topPattern, star, rdr,
-			func(rec int, str string) {
+			func(rec int, ofs int64, str string) {
 				beginTime := time.Now()
-				ProcessQuery(str[:], parent, "", "", rec, cmds, tbls)
+				ProcessQuery(str[:], parent, rec, cmds, tbls)
 				endTime := time.Now()
 				duration := endTime.Sub(beginTime)
 				micro := int(float64(duration.Nanoseconds()) / 1e3)
@@ -8050,7 +8291,7 @@ func main() {
 					fmt.Printf("%s\n", legend)
 					legend = ""
 				}
-				fmt.Printf("%d\t%d\t%d\n", rec, len(str), micro)
+				fmt.Printf("%d\t%d\t%d\t%d\n", rec, ofs, len(str), micro)
 			})
 
 		return
@@ -8079,7 +8320,7 @@ func main() {
 				}
 				close(out)
 			}(xmlq)
-			tblq := CreateConsumers(cmds, tbls, parent, "", "", numServ, xmlq)
+			tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq)
 
 			if xmlq == nil || tblq == nil {
 				fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
@@ -8143,7 +8384,7 @@ func main() {
 				}
 
 				xmlq := CreateProducer(topPattern, star, rdr, tbls)
-				tblq := CreateConsumers(cmds, tbls, parent, hd, tl, numServ, xmlq)
+				tblq := CreateConsumers(cmds, tbls, parent, numServ, xmlq)
 
 				if xmlq == nil || tblq == nil {
 					fmt.Fprintf(os.Stderr, "\nERROR: Unable to create servers\n")
@@ -8201,7 +8442,7 @@ func main() {
 		if cmds.Position == "first" {
 
 			PartitionPattern(topPattern, star, rdr,
-				func(rec int, str string) {
+				func(rec int, ofs int64, str string) {
 					if rec == 1 {
 						qry = str
 						idx = rec
@@ -8211,7 +8452,7 @@ func main() {
 		} else if cmds.Position == "last" {
 
 			PartitionPattern(topPattern, star, rdr,
-				func(rec int, str string) {
+				func(rec int, ofs int64, str string) {
 					qry = str
 					idx = rec
 				})
@@ -8226,7 +8467,7 @@ func main() {
 			}
 
 			PartitionPattern(topPattern, star, rdr,
-				func(rec int, str string) {
+				func(rec int, ofs int64, str string) {
 					if rec == number {
 						qry = str
 						idx = rec
@@ -8242,7 +8483,7 @@ func main() {
 		cmds.Position = ""
 
 		// process single selected record
-		res := ProcessQuery(qry[:], parent, "", "", idx, cmds, tbls)
+		res := ProcessQuery(qry[:], parent, idx, cmds, tbls)
 		if res != "" {
 			fmt.Printf("%s\n", res)
 		}
@@ -8260,7 +8501,7 @@ func main() {
 	}
 
 	// launch consumer goroutines to parse and explore partitioned XML objects
-	tblq := CreateConsumers(cmds, tbls, parent, hd, tl, numServers, xmlq)
+	tblq := CreateConsumers(cmds, tbls, parent, numServers, xmlq)
 	if tblq == nil {
 		fmt.Fprintf(os.Stderr, "\nERROR: Unable to create consumers\n")
 		os.Exit(1)
@@ -8319,7 +8560,7 @@ func main() {
 		buffer.WriteString("\n")
 	}
 
-	// printResult prints output for current pattern, handles -empty and -index flags, and periodically flushes buffer
+	// printResult prints output for current pattern, handles -empty and -ident flags, and periodically flushes buffer
 	printResult := func(curr Extract) {
 
 		str := curr.Text
@@ -8342,7 +8583,7 @@ func main() {
 
 			okay = true
 
-			if indx {
+			if idnt {
 				idx := curr.Index
 				val := strconv.Itoa(idx)
 				buffer.WriteString(val[:])

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ncbi-entrez-direct.git
