[jruby-joni] 41/194: second batch for ascii/unicode modifiers

Thu Feb 1 12:04:19 UTC 2018

This is an automated email from the git hooks/post-receive script.

henrich pushed a commit to branch debian/sid
in repository jruby-joni.

commit 4e3b5cc632f256304e0c143a101d8558dd7068bf
Author: Marcin Mielzynski <lopx at gazeta.pl>
Date:   Fri Dec 29 12:39:10 2017 +0100

    second batch for ascii/unicode modifiers
---
 src/org/joni/Analyser.java        |  6 +----
 src/org/joni/ArrayCompiler.java   | 42 +++++++++++++++++++++++++------
 src/org/joni/ByteCodeMachine.java | 53 +++++++++++++++++++++++++++++++++++++++
 src/org/joni/Matcher.java         |  6 +++++
 src/org/joni/Parser.java          |  2 +-
 test/org/joni/test/TestU8.java    | 42 +++++++++++++++++++++++++++++++
 6 files changed, 137 insertions(+), 14 deletions(-)

diff --git a/src/org/joni/Analyser.java b/src/org/joni/Analyser.java
index b3458b7..9da07f7 100644
--- a/src/org/joni/Analyser.java
+++ b/src/org/joni/Analyser.java
@@ -742,10 +742,6 @@ final class Analyser extends Parser {
         return len;
     }
 
-    boolean isMbcAsciiWord(Encoding enc, byte[]bytes, int p, int end) { // ONIGENC_IS_MBC_ASCII_WORD
-        return ASCIIEncoding.INSTANCE.isCodeCType(enc.mbcToCode(bytes, p, end), CharacterType.WORD);
-    }
-
     /* x is not included y ==>  1 : 0 */
     private boolean isNotIncluded(Node x, Node y) {
         Node tmp;
@@ -868,7 +864,7 @@ final class Analyser extends Parser {
                 switch (cy.ctype) {
                 case CharacterType.WORD:
                     if (cy.asciiRange) {
-                        if (isMbcAsciiWord(enc, xs.bytes, xs.p, xs.end)) {
+                        if (Matcher.isMbcAsciiWord(enc, xs.bytes, xs.p, xs.end)) {
                             return cy.not;
                         } else {
                             return !cy.not;
diff --git a/src/org/joni/ArrayCompiler.java b/src/org/joni/ArrayCompiler.java
index 88f1e0e..fb3c464 100644
--- a/src/org/joni/ArrayCompiler.java
+++ b/src/org/joni/ArrayCompiler.java
@@ -322,9 +322,17 @@ final class ArrayCompiler extends Compiler {
         switch (cn.ctype) {
         case CharacterType.WORD:
             if (cn.not) {
-                op = enc.isSingleByte() ? OPCode.NOT_WORD_SB : OPCode.NOT_WORD;
+                if (cn.asciiRange) {
+                    op = OPCode.NOT_ASCII_WORD;
+                } else {
+                    op = enc.isSingleByte() ? OPCode.NOT_WORD_SB : OPCode.NOT_WORD;
+                }
             } else {
-                op = enc.isSingleByte() ? OPCode.WORD_SB : OPCode.WORD;
+                if (cn.asciiRange) {
+                    op = OPCode.ASCII_WORD;
+                } else {
+                    op = enc.isSingleByte() ? OPCode.WORD_SB : OPCode.WORD;
+                }
             }
             break;
 
@@ -1021,21 +1029,39 @@ final class ArrayCompiler extends Compiler {
         case AnchorType.BEGIN_POSITION:     addOpcode(OPCode.BEGIN_POSITION);       break;
 
         case AnchorType.WORD_BOUND:
-            addOpcode(enc.isSingleByte() ? OPCode.WORD_BOUND_SB : OPCode.WORD_BOUND);
+            if (node.asciiRange) {
+                addOpcode(OPCode.ASCII_WORD_BOUND);
+            } else {
+                addOpcode(enc.isSingleByte() ? OPCode.WORD_BOUND_SB : OPCode.WORD_BOUND);
+            }
             break;
 
         case AnchorType.NOT_WORD_BOUND:
-            addOpcode(enc.isSingleByte() ? OPCode.NOT_WORD_BOUND_SB : OPCode.NOT_WORD_BOUND);
+            if (node.asciiRange) {
+                addOpcode(OPCode.NOT_ASCII_WORD_BOUND);
+            } else {
+                addOpcode(enc.isSingleByte() ? OPCode.NOT_WORD_BOUND_SB : OPCode.NOT_WORD_BOUND);
+            }
             break;
 
         case AnchorType.WORD_BEGIN:
-            if (Config.USE_WORD_BEGIN_END)
-                addOpcode(enc.isSingleByte() ? OPCode.WORD_BEGIN_SB : OPCode.WORD_BEGIN);
+            if (Config.USE_WORD_BEGIN_END) {
+                if (node.asciiRange) {
+                    addOpcode(OPCode.ASCII_WORD_BEGIN);
+                } else {
+                    addOpcode(enc.isSingleByte() ? OPCode.WORD_BEGIN_SB : OPCode.WORD_BEGIN);
+                }
+            }
             break;
 
         case AnchorType.WORD_END:
-            if (Config.USE_WORD_BEGIN_END)
-                addOpcode(enc.isSingleByte() ? OPCode.WORD_END_SB : OPCode.WORD_END);
+            if (Config.USE_WORD_BEGIN_END) {
+                if (node.asciiRange) {
+                    addOpcode(OPCode.ASCII_WORD_END);
+                } else {
+                    addOpcode(enc.isSingleByte() ? OPCode.WORD_END_SB : OPCode.WORD_END);
+                }
+            }
             break;
 
         case AnchorType.KEEP:
diff --git a/src/org/joni/ByteCodeMachine.java b/src/org/joni/ByteCodeMachine.java
index 10dc26e..f0f761f 100644
--- a/src/org/joni/ByteCodeMachine.java
+++ b/src/org/joni/ByteCodeMachine.java
@@ -241,6 +241,13 @@ class ByteCodeMachine extends StackMachine {
                 case OPCode.WORD_BEGIN:                 opWordBegin();             continue;
                 case OPCode.WORD_END:                   opWordEnd();               continue;
 
+                case OPCode.ASCII_WORD:                 opAsciiWord();             break;
+                case OPCode.NOT_ASCII_WORD:             opNotAsciiWord();          break;
+                case OPCode.ASCII_WORD_BOUND:           opAsciiWordBound();        break;
+                case OPCode.NOT_ASCII_WORD_BOUND:       opNotAsciiWordBound();     continue;
+                case OPCode.ASCII_WORD_BEGIN:           opAsciiWordBegin();        continue;
+                case OPCode.ASCII_WORD_END:             opAsciiWordEnd();          continue;
+
                 case OPCode.BEGIN_BUF:                  opBeginBuf();              continue;
                 case OPCode.END_BUF:                    opEndBuf();                continue;
                 case OPCode.BEGIN_LINE:                 opBeginLine();             continue;
@@ -1031,6 +1038,12 @@ class ByteCodeMachine extends StackMachine {
         sprev = sbegin; // break;
     }
 
+    private void opAsciiWord() {
+        if (s >= range || !isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+        s += enc.length(bytes, s, end);
+        sprev = sbegin; // break;
+    }
+
     private void opNotWord() {
         if (s >= range || enc.isMbcWord(bytes, s, end)) {opFail(); return;}
         s += enc.length(bytes, s, end);
@@ -1043,6 +1056,12 @@ class ByteCodeMachine extends StackMachine {
         sprev = sbegin; // break;
     }
 
+    private void opNotAsciiWord() {
+        if (s >= range || isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+        s += enc.length(bytes, s, end);
+        sprev = sbegin; // break;
+    }
+
     private void opWordBound() {
         if (s == str) {
             if (s >= range || !enc.isMbcWord(bytes, s, end)) {opFail(); return;}
@@ -1063,6 +1082,16 @@ class ByteCodeMachine extends StackMachine {
         }
     }
 
+    private void opAsciiWordBound() {
+        if (s == str) {
+            if (s >= range || !isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+        } else if (s == end) {
+            if (sprev >= end || !isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+        } else {
+            if (isMbcAsciiWord(enc, bytes, s, end) == isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+        }
+    }
+
     private void opNotWordBound() {
         if (s == str) {
             if (s < range && enc.isMbcWord(bytes, s, end)) {opFail(); return;}
@@ -1083,6 +1112,16 @@ class ByteCodeMachine extends StackMachine {
         }
     }
 
+    private void opNotAsciiWordBound() {
+        if (s == str) {
+            if (s < range && isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+        } else if (s == end) {
+            if (sprev < end && isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+        } else {
+            if (isMbcAsciiWord(enc, bytes, s, end) != isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+        }
+    }
+
     private void opWordBegin() {
         if (s < range && enc.isMbcWord(bytes, s, end)) {
             if (s == str || !enc.isMbcWord(bytes, sprev, end)) return;
@@ -1097,6 +1136,13 @@ class ByteCodeMachine extends StackMachine {
         opFail();
     }
 
+    private void opAsciiWordBegin() {
+        if (s < range && isMbcAsciiWord(enc, bytes, s, end)) {
+            if (s == str || !isMbcAsciiWord(enc, bytes, sprev, end)) return;
+        }
+        opFail();
+    }
+
     private void opWordEnd() {
         if (s != str && enc.isMbcWord(bytes, sprev, end)) {
             if (s == end || !enc.isMbcWord(bytes, s, end)) return;
@@ -1111,6 +1157,13 @@ class ByteCodeMachine extends StackMachine {
         opFail();
     }
 
+    private void opAsciiWordEnd() {
+        if (s != str && isMbcAsciiWord(enc, bytes, sprev, end)) {
+            if (s == end || !isMbcAsciiWord(enc, bytes, s, end)) return;
+        }
+        opFail();
+    }
+
     private void opBeginBuf() {
         if (s != str) opFail();
     }
diff --git a/src/org/joni/Matcher.java b/src/org/joni/Matcher.java
index 5b623fe..0d4c767 100644
--- a/src/org/joni/Matcher.java
+++ b/src/org/joni/Matcher.java
@@ -24,6 +24,8 @@ import static org.joni.Option.isFindLongest;
 
 import org.jcodings.Encoding;
 import org.jcodings.IntHolder;
+import org.jcodings.constants.CharacterType;
+import org.jcodings.specific.ASCIIEncoding;
 import org.joni.constants.AnchorType;
 
 public abstract class Matcher extends IntHolder {
@@ -592,4 +594,8 @@ public abstract class Matcher extends IntHolder {
         return icbuf == null ? icbuf = new byte[Config.ENC_MBC_CASE_FOLD_MAXLEN] : icbuf;
     }
 
+    static boolean isMbcAsciiWord(Encoding enc, byte[]bytes, int p, int end) { // ONIGENC_IS_MBC_ASCII_WORD
+        return ASCIIEncoding.INSTANCE.isCodeCType(enc.mbcToCode(bytes, p, end), CharacterType.WORD);
+    }
+
 }
diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java
index dfd21e2..9ac56fd 100644
--- a/src/org/joni/Parser.java
+++ b/src/org/joni/Parser.java
@@ -162,7 +162,7 @@ class Parser extends Lexer {
             neg = false;
         }
 
-        if (token.type == TokenType.CC_CLOSE) {
+        if (token.type == TokenType.CC_CLOSE && !syntax.op2OptionECMAScript()) {
             if (!codeExistCheck(']', true)) newSyntaxException(ERR_EMPTY_CHAR_CLASS);
             env.ccEscWarn("]");
             token.type = TokenType.CHAR; /* allow []...] */
diff --git a/test/org/joni/test/TestU8.java b/test/org/joni/test/TestU8.java
index 319b22c..fb69b45 100755
--- a/test/org/joni/test/TestU8.java
+++ b/test/org/joni/test/TestU8.java
@@ -203,10 +203,52 @@ public class TestU8 extends Test {
         x2s("(?i)АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", "абвгдеёжзийклмнопрстуфхцчшщъыьэюя", 0, 33 * 2);
         x2s("(?i)АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", 0, 33 * 2);
 
+        x2s("(?u)\\w+", "あa#", 0, 4);
+        x2s("(?a)\\w+", "あa#", 3, 4);
+        x2s("(?u)\\W+", "あa#", 4, 5);
+        x2s("(?a)\\W+", "あa#", 0, 3);
+
+        x2s("(?a)\\b", "あa", 3, 3);
+        x2s("(?a)\\w\\b", "aあ", 0, 1);
+        x2s("(?a)\\B", "a ああ ", 2, 2);
+
+        x2s("(?u)\\B", "あ ", 4, 4);
+        x2s("(?a)\\B", "あ ", 0, 0);
+        x2s("(?a)\\B", "aあ ", 4, 4);
+
+        x2s("(?a)a\\b", " a", 1, 2);
+        x2s("(?u)a\\b", " a", 1, 2);
+        ns("(?a)a\\B", " a");
+        ns("(?a)あ\\b", " あ");
+        x2s("(?u)あ\\b", " あ", 1, 4);
+        x2s("(?a)あ\\B", " あ", 1, 4);
+        ns("(?u)あ\\B", " あ");
+
+        x2s("(?a)\\p{Alpha}\\P{Alpha}", "a。", 0, 4);
+        x2s("(?u)\\p{Alpha}\\P{Alpha}", "a。", 0, 4);
+        x2s("(?a)[[:word:]]+", "aあ", 0, 1);
+        x2s("(?a)[[:^word:]]+", "aあ", 1, 4);
+        x2s("(?u)[[:word:]]+", "aあ", 0, 4);
+        ns("(?u)[[:^word:]]+", "aあ");
+
         x2s("(?iu)\\p{lower}\\p{upper}", "Ab", 0, 2);
         x2s("(?ia)\\p{lower}\\p{upper}", "Ab", 0, 2);
         x2s("(?iu)[[:lower:]][[:upper:]]", "Ab", 0, 2);
         x2s("(?ia)[[:lower:]][[:upper:]]", "Ab", 0, 2);
+
+        ns("(?ia)\\w+", "\u212a\u017f");
+        ns("(?ia)[\\w]+", "\u212a\u017f");
+        ns("(?ia)[^\\W]+", "\u212a\u017f");
+        x2s("(?ia)[^\\W]+", "ks", 0, 2);
+        ns("(?iu)\\p{ASCII}", "\u212a");
+        ns("(?iu)\\P{ASCII}", "s");
+        ns("(?iu)[\\p{ASCII}]", "\u212a");
+        ns("(?iu)[\\P{ASCII}]", "s");
+        ns("(?ia)\\p{ASCII}", "\u212a");
+        ns("(?ia)\\P{ASCII}", "s");
+        ns("(?ia)[\\p{ASCII}]", "\u212a");
+        ns("(?ia)[\\P{ASCII}]", "s");
+
         super.test();
     }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git