[Pkg-javascript-commits] [node-regjsparser] 01/07: New upstream version 0.3.0+ds

Mon Feb 19 23:44:05 UTC 2018

This is an automated email from the git hooks/post-receive script.

jpuydt-guest pushed a commit to branch master
in repository node-regjsparser.

commit 8e788f1e58e9eccbe3fa019ac265efa52bc0f621
Author: Julien Puydt <julien.puydt at laposte.net>
Date:   Thu Jan 18 08:44:49 2018 +0100

    New upstream version 0.3.0+ds
---
 README.md                          |  10 ++
 parser.js                          | 200 ++++++++++++++++++++++++++++++++++---
 tools/generate-identifier-regex.js |  20 ++--
 3 files changed, 205 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 83f67e9..8ed945c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,16 @@ var parse = require('regjsparser').parse;
 
 var parseTree = parse('^a'); // /^a/
 console.log(parseTree);
+
+// Toggle on/off additional features:
+var parseTree = parse('^a', '', {
+  // SEE: https://github.com/jviereck/regjsparser/pull/78
+  unicodePropertyEscape: true,
+
+  // SEE: https://github.com/jviereck/regjsparser/pull/83
+  namedGroups: true
+});
+console.log(parseTree);
 ```
 
 ## Testing
diff --git a/parser.js b/parser.js
index 22a054a..93ff428 100644
--- a/parser.js
+++ b/parser.js
@@ -48,7 +48,7 @@
 //      .
 //      \ AtomEscape
 //      CharacterClass
-//      ( Disjunction )
+//      ( GroupSpecifier Disjunction )
 //      ( ? : Disjunction )
 //
 // PatternCharacter ::
@@ -58,6 +58,7 @@
 //      DecimalEscape
 //      CharacterEscape
 //      CharacterClassEscape
+//      k GroupName
 //
 // CharacterEscape[U] ::
 //      ControlEscape
@@ -115,9 +116,80 @@
 //      b
 //      CharacterEscape
 //      CharacterClassEscape
+//
+// GroupSpecifier ::
+//      [empty]
+//      ? GroupName
+//
+// GroupName ::
+//      < RegExpIdentifierName >
+//
+// RegExpIdentifierName ::
+//      RegExpIdentifierStart
+//      RegExpIdentifierName RegExpIdentifierContinue
+//
+// RegExpIdentifierStart ::
+//      UnicodeIDStart
+//      $
+//      _
+//      \ RegExpUnicodeEscapeSequence
+//
+// RegExpIdentifierContinue ::
+//      UnicodeIDContinue
+//      $
+//      _
+//      \ RegExpUnicodeEscapeSequence
+//      <ZWNJ>
+//      <ZWJ>
 
 (function() {
 
+  var fromCodePoint = String.fromCodePoint || (function() { // prefer the native String.fromCodePoint; otherwise build the fallback below
+    // Implementation taken from
+    // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint
+
+    var stringFromCharCode = String.fromCharCode;
+    var floor = Math.floor;
+
+    return function fromCodePoint() { // takes any number of code points via `arguments`; returns the concatenated string
+      var MAX_SIZE = 0x4000; // codeUnits is flushed into result once it grows past this size
+      var codeUnits = []; // pending UTF-16 code units not yet appended to result
+      var highSurrogate;
+      var lowSurrogate;
+      var index = -1;
+      var length = arguments.length;
+      if (!length) { // called without arguments: empty string
+        return '';
+      }
+      var result = '';
+      while (++index < length) {
+        var codePoint = Number(arguments[index]); // coerce each argument to a number before validating it
+        if (
+          !isFinite(codePoint) ||       // `NaN`, `+Infinity`, or `-Infinity`
+          codePoint < 0 ||              // not a valid Unicode code point
+          codePoint > 0x10FFFF ||       // not a valid Unicode code point
+          floor(codePoint) != codePoint // not an integer
+        ) {
+          throw RangeError('Invalid code point: ' + codePoint); // any invalid argument aborts the whole call
+        }
+        if (codePoint <= 0xFFFF) { // BMP code point
+          codeUnits.push(codePoint);
+        } else { // Astral code point; split in surrogate halves
+          // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
+          codePoint -= 0x10000;
+          highSurrogate = (codePoint >> 10) + 0xD800;
+          lowSurrogate = (codePoint % 0x400) + 0xDC00;
+          codeUnits.push(highSurrogate, lowSurrogate);
+        }
+        if (index + 1 == length || codeUnits.length > MAX_SIZE) { // flush on the last argument or when the buffer exceeds MAX_SIZE
+          result += stringFromCharCode.apply(null, codeUnits);
+          codeUnits.length = 0; // empty the buffer after flushing
+        }
+      }
+      return result;
+    };
+  }());
+
   function parse(str, flags, features) {
     if (!features) {
       features = {};
@@ -220,6 +292,17 @@
       });
     }
 
+    function createNamedReference(name) { // Builds a 'reference' node carrying `name` and returns the result of addRaw.
+      return addRaw({
+        type: 'reference',
+        name: name, // stored as given; its `range` is read just below
+        range: [
+          name.range[0] - 3, // NOTE(review): the 3 presumably covers the `\k<` prefix before the name — confirm
+          pos // value of the outer `pos` at call time
+        ]
+      });
+    }
+
     function createGroup(behavior, disjunction, from, to) {
       return addRaw({
         type: 'group',
@@ -424,6 +507,10 @@
         return false;
       }
 
+      return finishGroup(type, from);
+    }
+
+    function finishGroup(type, from) {
       var body = parseDisjunction();
       if (!body) {
         bail('Expected disjunction');
@@ -524,7 +611,7 @@
       //      .
       //      \ AtomEscape
       //      CharacterClass
-      //      ( Disjunction )
+      //      ( GroupSpecifier Disjunction )
       //      ( ? : Disjunction )
 
       var res;
@@ -551,6 +638,13 @@
       else if (res = parseCharacterClass()) {
         return res;
       }
+      else if (features.namedGroups && match("(?<")) {
+        var name = parseIdentifier();
+        skip(">");
+        var group = finishGroup("normal", name.range[0] - 3);
+        group.name = name;
+        return group;
+      }
       else {
         //      ( Disjunction )
         //      ( ? : Disjunction )
@@ -593,10 +687,11 @@
       //      DecimalEscape
       //      CharacterEscape
       //      CharacterClassEscape
+      //      k GroupName
 
       var res, from = pos;
 
-      res = parseDecimalEscape();
+      res = parseDecimalEscape() || parseNamedReference();
       if (res) {
         return res;
       }
@@ -681,6 +776,27 @@
       return false;
     }
 
+    function parseNamedReference() { // Parses `k<name>` into a named reference; returns undefined when the feature is off or the input does not match.
+      if (features.namedGroups && matchReg(/^k<(?=.*?>)/)) { // the lookahead demands a `>` after `k<` with no line terminator in between
+        var name = parseIdentifier();
+        skip('>');
+        return createNamedReference(name); // called after the closing `>` has been skipped
+      }
+    }
+
+    function parseRegExpUnicodeEscapeSequence() { // Tries `uXXXX`, then `u{X...}` when hasUnicodeFlag is set; returns undefined if neither matches.
+      var res; // both patterns start at `u`, so the leading backslash is not matched here
+      if (res = matchReg(/^u([0-9a-fA-F]{4})/)) {
+        // UnicodeEscapeSequence
+        return parseUnicodeSurrogatePairEscape( // NOTE(review): helper is not shown here; presumably combines a following surrogate escape — confirm
+          createEscaped('unicodeEscape', parseInt(res[1], 16), res[1], 2)
+        );
+      } else if (hasUnicodeFlag && (res = matchReg(/^u\{([0-9a-fA-F]+)\}/))) {
+        // RegExpUnicodeEscapeSequence (ES6 Unicode code point escape)
+        return createEscaped('unicodeCodePointEscape', parseInt(res[1], 16), res[1], 4);
+      }
+    }
+
     function parseCharacterEscape() {
       // CharacterEscape ::
       //      ControlEscape
@@ -707,14 +823,8 @@
       } else if (res = matchReg(/^x([0-9a-fA-F]{2})/)) {
         // HexEscapeSequence
         return createEscaped('hexadecimalEscape', parseInt(res[1], 16), res[1], 2);
-      } else if (res = matchReg(/^u([0-9a-fA-F]{4})/)) {
-        // UnicodeEscapeSequence
-        return parseUnicodeSurrogatePairEscape(
-          createEscaped('unicodeEscape', parseInt(res[1], 16), res[1], 2)
-        );
-      } else if (hasUnicodeFlag && (res = matchReg(/^u\{([0-9a-fA-F]+)\}/))) {
-        // RegExpUnicodeEscapeSequence (ES6 Unicode code point escape)
-        return createEscaped('unicodeCodePointEscape', parseInt(res[1], 16), res[1], 4);
+      } else if (res = parseRegExpUnicodeEscapeSequence()) {
+        return res;
       } else if (features.unicodePropertyEscape && hasUnicodeFlag && (res = matchReg(/^([pP])\{([^\}]+)\}/))) {
         // https://github.com/jviereck/regjsparser/issues/77
         return addRaw({
@@ -730,17 +840,77 @@
       }
     }
 
-    // Taken from the Esprima parser.
-    function isIdentifierPart(ch) {
+    function parseIdentifierAtom(check) { // Consumes one identifier character (literal or `\u` escape) accepted by `check`; returns it as a string, or undefined if rejected.
+      var ch = lookahead(); // NOTE(review): if lookahead() can yield undefined at end of input, ch.charCodeAt below would throw — confirm
+      var from = pos; // remembered for the error range passed to bail
+      if (!check(ch.charCodeAt(0))) return; // check receives a number: a UTF-16 code unit here, esc.codePoint below
+      incr();
+      if (ch === '\\') { // reached for a backslash only when check accepted char code 92
+        var esc = parseRegExpUnicodeEscapeSequence();
+        if (!esc || !check(esc.codePoint)) { // the escaped code point must itself pass check
+          bail('Invalid escape sequence', null, from, pos);
+        }
+        return fromCodePoint(esc.codePoint); // return the escaped character itself, not the escape text
+      }
+      return ch;
+    }
+
+    function parseIdentifier() { // Parses a RegExpIdentifierName and returns an 'identifier' node via addRaw.
+      // RegExpIdentifierName ::
+      //      RegExpIdentifierStart
+      //      RegExpIdentifierName RegExpIdentifierContinue
+      //
+      // RegExpIdentifierStart ::
+      //      UnicodeIDStart
+      //      $
+      //      _
+      //      \ RegExpUnicodeEscapeSequence
+      //
+      // RegExpIdentifierContinue ::
+      //      UnicodeIDContinue
+      //      $
+      //      _
+      //      \ RegExpUnicodeEscapeSequence
+      //      <ZWNJ>
+      //      <ZWJ>
+
+      var start = pos; // becomes the first element of the node's range
+      var res = parseIdentifierAtom(isIdentifierStart); // first character is checked with isIdentifierStart
+      if (!res) {
+        bail('Invalid identifier');
+      }
+
+      var ch;
+      while (ch = parseIdentifierAtom(isIdentifierPart)) { // keep appending until an atom is rejected (falsy result)
+        res += ch;
+      }
+      
+      return addRaw({
+        type: 'identifier',
+        value: res, // accumulated characters, with escapes already converted by parseIdentifierAtom
+        range: [start, pos]
+      });
+    }
+
+    function isIdentifierStart(ch) {
       // Generated by `tools/generate-identifier-regex.js`.
-      var NonAsciiIdentifierPart = new RegExp('[\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0300-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u0483-\u0487\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u05D0-\u05EA\u05F0-\u05F2\u0610-\u061A\u0620-\u0669\u066E-\u06D3\u06D5-\u06DC\u06DF-\u06E8\u06EA-\u06FC\u06FF\u0710-\u074A\u074D-\u07B1\u07C0-\u07F5\ [...]
+      var NonAsciiIdentifierStart = /[\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u [...]
 
       return (ch === 36) || (ch === 95) ||  // $ (dollar) and _ (underscore)
         (ch >= 65 && ch <= 90) ||         // A..Z
         (ch >= 97 && ch <= 122) ||        // a..z
         (ch >= 48 && ch <= 57) ||         // 0..9
         (ch === 92) ||                    // \ (backslash)
-        ((ch >= 0x80) && NonAsciiIdentifierPart.test(String.fromCharCode(ch)));
+        ((ch >= 0x80) && NonAsciiIdentifierStart.test(String.fromCharCode(ch)));
+    }
+
+    // Taken from the Esprima parser.
+    function isIdentifierPart(ch) {
+      // Generated by `tools/generate-identifier-regex.js`.
+      var NonAsciiIdentifierPartOnly = /[\u0300-\u036F\u0483-\u0487\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u0669\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u06F0-\u06F9\u0711\u0730-\u074A\u07A6-\u07B0\u07C0-\u07C9\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08E4-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0966-\u096F\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u09E [...]
+
+      return isIdentifierStart(ch) ||
+        ((ch >= 0x80) && NonAsciiIdentifierPartOnly.test(String.fromCharCode(ch)));
     }
 
     function parseIdentityEscape() {
diff --git a/tools/generate-identifier-regex.js b/tools/generate-identifier-regex.js
index bd06403..af1fc06 100644
--- a/tools/generate-identifier-regex.js
+++ b/tools/generate-identifier-regex.js
@@ -28,24 +28,24 @@ var generateES5Regex = function() { // ES 5.1
     .add(Lu, Ll, Lt, Lm, Lo, Nl)
     .removeRange(0x010000, 0x10FFFF) // remove astral symbols
     .removeRange(0x0, 0x7F); // remove ASCII symbols (regjsparser-specific)
-  var identifierPart = identifierStart.clone()
-    .add('\u200C', '\u200D', Mn, Mc, Nd, Pc)
+  var identifierPartOnly = regenerate('\u200C', '\u200D')
+    .add(Mn, Mc, Nd, Pc)
     .removeRange(0x010000, 0x10FFFF) // remove astral symbols
     .removeRange(0x0, 0x7F); // remove ASCII symbols (regjsparser-specific)
   return {
     'NonAsciiIdentifierStart': identifierStart.toString(),
-    'NonAsciiIdentifierPart': identifierPart.toString()
+    'NonAsciiIdentifierPartOnly': identifierPartOnly.toString()
   };
 };
 
 var result = generateES5Regex();
-// console.log(
-//   '// ECMAScript 5.1/Unicode v%s NonAsciiIdentifierStart:\n\n%s\n',
-//   version,
-//   result.NonAsciiIdentifierStart
-// );
 console.log(
-  '// ECMAScript 5.1/Unicode v%s NonAsciiIdentifierPart:\n\n%s',
+  '// ECMAScript 5.1/Unicode v%s NonAsciiIdentifierStart:\n\n%s\n',
   version,
-  result.NonAsciiIdentifierPart
+  result.NonAsciiIdentifierStart
+);
+console.log(
+  '// ECMAScript 5.1/Unicode v%s NonAsciiIdentifierPartOnly:\n\n%s',
+  version,
+  result.NonAsciiIdentifierPartOnly
 );

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-javascript/node-regjsparser.git