[jruby-joni] 41/194: second batch for ascii/unicode modifiers
Hideki Yamane
henrich at moszumanska.debian.org
Thu Feb 1 12:04:19 UTC 2018
This is an automated email from the git hooks/post-receive script.
henrich pushed a commit to branch debian/sid
in repository jruby-joni.
commit 4e3b5cc632f256304e0c143a101d8558dd7068bf
Author: Marcin Mielzynski <lopx at gazeta.pl>
Date: Fri Dec 29 12:39:10 2017 +0100
second batch for ascii/unicode modifiers
---
src/org/joni/Analyser.java | 6 +----
src/org/joni/ArrayCompiler.java | 42 +++++++++++++++++++++++++------
src/org/joni/ByteCodeMachine.java | 53 +++++++++++++++++++++++++++++++++++++++
src/org/joni/Matcher.java | 6 +++++
src/org/joni/Parser.java | 2 +-
test/org/joni/test/TestU8.java | 42 +++++++++++++++++++++++++++++++
6 files changed, 137 insertions(+), 14 deletions(-)
diff --git a/src/org/joni/Analyser.java b/src/org/joni/Analyser.java
index b3458b7..9da07f7 100644
--- a/src/org/joni/Analyser.java
+++ b/src/org/joni/Analyser.java
@@ -742,10 +742,6 @@ final class Analyser extends Parser {
return len;
}
- boolean isMbcAsciiWord(Encoding enc, byte[]bytes, int p, int end) { // ONIGENC_IS_MBC_ASCII_WORD
- return ASCIIEncoding.INSTANCE.isCodeCType(enc.mbcToCode(bytes, p, end), CharacterType.WORD);
- }
-
/* x is not included y ==> 1 : 0 */
private boolean isNotIncluded(Node x, Node y) {
Node tmp;
@@ -868,7 +864,7 @@ final class Analyser extends Parser {
switch (cy.ctype) {
case CharacterType.WORD:
if (cy.asciiRange) {
- if (isMbcAsciiWord(enc, xs.bytes, xs.p, xs.end)) {
+ if (Matcher.isMbcAsciiWord(enc, xs.bytes, xs.p, xs.end)) {
return cy.not;
} else {
return !cy.not;
diff --git a/src/org/joni/ArrayCompiler.java b/src/org/joni/ArrayCompiler.java
index 88f1e0e..fb3c464 100644
--- a/src/org/joni/ArrayCompiler.java
+++ b/src/org/joni/ArrayCompiler.java
@@ -322,9 +322,17 @@ final class ArrayCompiler extends Compiler {
switch (cn.ctype) {
case CharacterType.WORD:
if (cn.not) {
- op = enc.isSingleByte() ? OPCode.NOT_WORD_SB : OPCode.NOT_WORD;
+ if (cn.asciiRange) {
+ op = OPCode.NOT_ASCII_WORD;
+ } else {
+ op = enc.isSingleByte() ? OPCode.NOT_WORD_SB : OPCode.NOT_WORD;
+ }
} else {
- op = enc.isSingleByte() ? OPCode.WORD_SB : OPCode.WORD;
+ if (cn.asciiRange) {
+ op = OPCode.ASCII_WORD;
+ } else {
+ op = enc.isSingleByte() ? OPCode.WORD_SB : OPCode.WORD;
+ }
}
break;
@@ -1021,21 +1029,39 @@ final class ArrayCompiler extends Compiler {
case AnchorType.BEGIN_POSITION: addOpcode(OPCode.BEGIN_POSITION); break;
case AnchorType.WORD_BOUND:
- addOpcode(enc.isSingleByte() ? OPCode.WORD_BOUND_SB : OPCode.WORD_BOUND);
+ if (node.asciiRange) {
+ addOpcode(OPCode.ASCII_WORD_BOUND);
+ } else {
+ addOpcode(enc.isSingleByte() ? OPCode.WORD_BOUND_SB : OPCode.WORD_BOUND);
+ }
break;
case AnchorType.NOT_WORD_BOUND:
- addOpcode(enc.isSingleByte() ? OPCode.NOT_WORD_BOUND_SB : OPCode.NOT_WORD_BOUND);
+ if (node.asciiRange) {
+ addOpcode(OPCode.NOT_ASCII_WORD_BOUND);
+ } else {
+ addOpcode(enc.isSingleByte() ? OPCode.NOT_WORD_BOUND_SB : OPCode.NOT_WORD_BOUND);
+ }
break;
case AnchorType.WORD_BEGIN:
- if (Config.USE_WORD_BEGIN_END)
- addOpcode(enc.isSingleByte() ? OPCode.WORD_BEGIN_SB : OPCode.WORD_BEGIN);
+ if (Config.USE_WORD_BEGIN_END) {
+ if (node.asciiRange) {
+ addOpcode(OPCode.ASCII_WORD_BEGIN);
+ } else {
+ addOpcode(enc.isSingleByte() ? OPCode.WORD_BEGIN_SB : OPCode.WORD_BEGIN);
+ }
+ }
break;
case AnchorType.WORD_END:
- if (Config.USE_WORD_BEGIN_END)
- addOpcode(enc.isSingleByte() ? OPCode.WORD_END_SB : OPCode.WORD_END);
+ if (Config.USE_WORD_BEGIN_END) {
+ if (node.asciiRange) {
+ addOpcode(OPCode.ASCII_WORD_END);
+ } else {
+ addOpcode(enc.isSingleByte() ? OPCode.WORD_END_SB : OPCode.WORD_END);
+ }
+ }
break;
case AnchorType.KEEP:
diff --git a/src/org/joni/ByteCodeMachine.java b/src/org/joni/ByteCodeMachine.java
index 10dc26e..f0f761f 100644
--- a/src/org/joni/ByteCodeMachine.java
+++ b/src/org/joni/ByteCodeMachine.java
@@ -241,6 +241,13 @@ class ByteCodeMachine extends StackMachine {
case OPCode.WORD_BEGIN: opWordBegin(); continue;
case OPCode.WORD_END: opWordEnd(); continue;
+ case OPCode.ASCII_WORD: opAsciiWord(); break;
+ case OPCode.NOT_ASCII_WORD: opNotAsciiWord(); break;
+ case OPCode.ASCII_WORD_BOUND: opAsciiWordBound(); break;
+ case OPCode.NOT_ASCII_WORD_BOUND: opNotAsciiWordBound(); continue;
+ case OPCode.ASCII_WORD_BEGIN: opAsciiWordBegin(); continue;
+ case OPCode.ASCII_WORD_END: opAsciiWordEnd(); continue;
+
case OPCode.BEGIN_BUF: opBeginBuf(); continue;
case OPCode.END_BUF: opEndBuf(); continue;
case OPCode.BEGIN_LINE: opBeginLine(); continue;
@@ -1031,6 +1038,12 @@ class ByteCodeMachine extends StackMachine {
sprev = sbegin; // break;
}
+ private void opAsciiWord() {
+ if (s >= range || !isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+ s += enc.length(bytes, s, end);
+ sprev = sbegin; // break;
+ }
+
private void opNotWord() {
if (s >= range || enc.isMbcWord(bytes, s, end)) {opFail(); return;}
s += enc.length(bytes, s, end);
@@ -1043,6 +1056,12 @@ class ByteCodeMachine extends StackMachine {
sprev = sbegin; // break;
}
+ private void opNotAsciiWord() {
+ if (s >= range || isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+ s += enc.length(bytes, s, end);
+ sprev = sbegin; // break;
+ }
+
private void opWordBound() {
if (s == str) {
if (s >= range || !enc.isMbcWord(bytes, s, end)) {opFail(); return;}
@@ -1063,6 +1082,16 @@ class ByteCodeMachine extends StackMachine {
}
}
+ private void opAsciiWordBound() {
+ if (s == str) {
+ if (s >= range || !isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+ } else if (s == end) {
+ if (sprev >= end || !isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+ } else {
+ if (isMbcAsciiWord(enc, bytes, s, end) == isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+ }
+ }
+
private void opNotWordBound() {
if (s == str) {
if (s < range && enc.isMbcWord(bytes, s, end)) {opFail(); return;}
@@ -1083,6 +1112,16 @@ class ByteCodeMachine extends StackMachine {
}
}
+ private void opNotAsciiWordBound() {
+ if (s == str) {
+ if (s < range && isMbcAsciiWord(enc, bytes, s, end)) {opFail(); return;}
+ } else if (s == end) {
+ if (sprev < end && isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+ } else {
+ if (isMbcAsciiWord(enc, bytes, s, end) != isMbcAsciiWord(enc, bytes, sprev, end)) {opFail(); return;}
+ }
+ }
+
private void opWordBegin() {
if (s < range && enc.isMbcWord(bytes, s, end)) {
if (s == str || !enc.isMbcWord(bytes, sprev, end)) return;
@@ -1097,6 +1136,13 @@ class ByteCodeMachine extends StackMachine {
opFail();
}
+ private void opAsciiWordBegin() {
+ if (s < range && isMbcAsciiWord(enc, bytes, s, end)) {
+ if (s == str || !isMbcAsciiWord(enc, bytes, sprev, end)) return;
+ }
+ opFail();
+ }
+
private void opWordEnd() {
if (s != str && enc.isMbcWord(bytes, sprev, end)) {
if (s == end || !enc.isMbcWord(bytes, s, end)) return;
@@ -1111,6 +1157,13 @@ class ByteCodeMachine extends StackMachine {
opFail();
}
+ private void opAsciiWordEnd() {
+ if (s != str && isMbcAsciiWord(enc, bytes, sprev, end)) {
+ if (s == end || !isMbcAsciiWord(enc, bytes, s, end)) return;
+ }
+ opFail();
+ }
+
private void opBeginBuf() {
if (s != str) opFail();
}
diff --git a/src/org/joni/Matcher.java b/src/org/joni/Matcher.java
index 5b623fe..0d4c767 100644
--- a/src/org/joni/Matcher.java
+++ b/src/org/joni/Matcher.java
@@ -24,6 +24,8 @@ import static org.joni.Option.isFindLongest;
import org.jcodings.Encoding;
import org.jcodings.IntHolder;
+import org.jcodings.constants.CharacterType;
+import org.jcodings.specific.ASCIIEncoding;
import org.joni.constants.AnchorType;
public abstract class Matcher extends IntHolder {
@@ -592,4 +594,8 @@ public abstract class Matcher extends IntHolder {
return icbuf == null ? icbuf = new byte[Config.ENC_MBC_CASE_FOLD_MAXLEN] : icbuf;
}
+ static boolean isMbcAsciiWord(Encoding enc, byte[]bytes, int p, int end) { // ONIGENC_IS_MBC_ASCII_WORD
+ return ASCIIEncoding.INSTANCE.isCodeCType(enc.mbcToCode(bytes, p, end), CharacterType.WORD);
+ }
+
}
diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java
index dfd21e2..9ac56fd 100644
--- a/src/org/joni/Parser.java
+++ b/src/org/joni/Parser.java
@@ -162,7 +162,7 @@ class Parser extends Lexer {
neg = false;
}
- if (token.type == TokenType.CC_CLOSE) {
+ if (token.type == TokenType.CC_CLOSE && !syntax.op2OptionECMAScript()) {
if (!codeExistCheck(']', true)) newSyntaxException(ERR_EMPTY_CHAR_CLASS);
env.ccEscWarn("]");
token.type = TokenType.CHAR; /* allow []...] */
diff --git a/test/org/joni/test/TestU8.java b/test/org/joni/test/TestU8.java
index 319b22c..fb69b45 100755
--- a/test/org/joni/test/TestU8.java
+++ b/test/org/joni/test/TestU8.java
@@ -203,10 +203,52 @@ public class TestU8 extends Test {
x2s("(?i)АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", "абвгдеёжзийклмнопрстуфхцчшщъыьэюя", 0, 33 * 2);
x2s("(?i)АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", 0, 33 * 2);
+ x2s("(?u)\\w+", "あa#", 0, 4);
+ x2s("(?a)\\w+", "あa#", 3, 4);
+ x2s("(?u)\\W+", "あa#", 4, 5);
+ x2s("(?a)\\W+", "あa#", 0, 3);
+
+ x2s("(?a)\\b", "あa", 3, 3);
+ x2s("(?a)\\w\\b", "aあ", 0, 1);
+ x2s("(?a)\\B", "a ああ ", 2, 2);
+
+ x2s("(?u)\\B", "あ ", 4, 4);
+ x2s("(?a)\\B", "あ ", 0, 0);
+ x2s("(?a)\\B", "aあ ", 4, 4);
+
+ x2s("(?a)a\\b", " a", 1, 2);
+ x2s("(?u)a\\b", " a", 1, 2);
+ ns("(?a)a\\B", " a");
+ ns("(?a)あ\\b", " あ");
+ x2s("(?u)あ\\b", " あ", 1, 4);
+ x2s("(?a)あ\\B", " あ", 1, 4);
+ ns("(?u)あ\\B", " あ");
+
+ x2s("(?a)\\p{Alpha}\\P{Alpha}", "a。", 0, 4);
+ x2s("(?u)\\p{Alpha}\\P{Alpha}", "a。", 0, 4);
+ x2s("(?a)[[:word:]]+", "aあ", 0, 1);
+ x2s("(?a)[[:^word:]]+", "aあ", 1, 4);
+ x2s("(?u)[[:word:]]+", "aあ", 0, 4);
+ ns("(?u)[[:^word:]]+", "aあ");
+
x2s("(?iu)\\p{lower}\\p{upper}", "Ab", 0, 2);
x2s("(?ia)\\p{lower}\\p{upper}", "Ab", 0, 2);
x2s("(?iu)[[:lower:]][[:upper:]]", "Ab", 0, 2);
x2s("(?ia)[[:lower:]][[:upper:]]", "Ab", 0, 2);
+
+ ns("(?ia)\\w+", "\u212a\u017f");
+ ns("(?ia)[\\w]+", "\u212a\u017f");
+ ns("(?ia)[^\\W]+", "\u212a\u017f");
+ x2s("(?ia)[^\\W]+", "ks", 0, 2);
+ ns("(?iu)\\p{ASCII}", "\u212a");
+ ns("(?iu)\\P{ASCII}", "s");
+ ns("(?iu)[\\p{ASCII}]", "\u212a");
+ ns("(?iu)[\\P{ASCII}]", "s");
+ ns("(?ia)\\p{ASCII}", "\u212a");
+ ns("(?ia)\\P{ASCII}", "s");
+ ns("(?ia)[\\p{ASCII}]", "\u212a");
+ ns("(?ia)[\\P{ASCII}]", "s");
+
super.test();
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git
More information about the pkg-java-commits mailing list