[jruby-joni] 38/194: first batch for ascii/unicode modifiers
Hideki Yamane
henrich at moszumanska.debian.org
Thu Feb 1 12:04:19 UTC 2018
This is an automated email from the git hooks/post-receive script.
henrich pushed a commit to branch debian/sid
in repository jruby-joni.
commit 416f5f9bcd8ce9482648702fbe428aa5ae0b51e6
Author: Marcin Mielzynski <lopx at gazeta.pl>
Date: Thu Dec 28 19:26:13 2017 +0100
first batch for ascii/unicode modifiers
---
src/org/joni/Analyser.java | 43 +++++-
src/org/joni/ApplyCaseFold.java | 40 +++--
src/org/joni/ApplyCaseFoldArg.java | 5 +-
src/org/joni/CodeRangeBuffer.java | 2 +-
src/org/joni/Config.java | 2 +-
src/org/joni/Lexer.java | 29 ++--
src/org/joni/Option.java | 12 ++
src/org/joni/Parser.java | 303 +++++++++++++++++++++----------------
src/org/joni/ScanEnvironment.java | 2 +-
src/org/joni/ast/CClassNode.java | 115 +++++++-------
test/org/joni/test/TestU8.java | 24 +--
11 files changed, 329 insertions(+), 248 deletions(-)
diff --git a/src/org/joni/Analyser.java b/src/org/joni/Analyser.java
index 3458468..b3458b7 100644
--- a/src/org/joni/Analyser.java
+++ b/src/org/joni/Analyser.java
@@ -35,9 +35,11 @@ import static org.joni.ast.QuantifierNode.isRepeatInfinite;
import java.util.HashSet;
import org.jcodings.CaseFoldCodeItem;
+import org.jcodings.Encoding;
import org.jcodings.ObjPtr;
import org.jcodings.Ptr;
import org.jcodings.constants.CharacterType;
+import org.jcodings.specific.ASCIIEncoding;
import org.joni.ast.AnchorNode;
import org.joni.ast.BackRefNode;
import org.joni.ast.CClassNode;
@@ -740,6 +742,10 @@ final class Analyser extends Parser {
return len;
}
+ boolean isMbcAsciiWord(Encoding enc, byte[]bytes, int p, int end) { // ONIGENC_IS_MBC_ASCII_WORD
+ return ASCIIEncoding.INSTANCE.isCodeCType(enc.mbcToCode(bytes, p, end), CharacterType.WORD);
+ }
+
/* x is not included y ==> 1 : 0 */
private boolean isNotIncluded(Node x, Node y) {
Node tmp;
@@ -755,7 +761,7 @@ final class Analyser extends Parser {
case NodeType.CTYPE:
CTypeNode cny = (CTypeNode)y;
CTypeNode cnx = (CTypeNode)x;
- return cny.ctype == cnx.ctype && cny.not != cnx.not;
+ return cny.ctype == cnx.ctype && cny.not != cnx.not && cny.asciiRange == cnx.asciiRange;
case NodeType.CCLASS:
// !swap:!
@@ -788,15 +794,27 @@ final class Analyser extends Parser {
if (xc.mbuf == null && !xc.isNot()) {
for (int i=0; i<BitSet.SINGLE_BYTE_SIZE; i++) {
if (xc.bs.at(i)) {
- if (enc.isSbWord(i)) return false;
+ if (((CTypeNode)y).asciiRange) {
+ if (enc.isSbWord(i)) return false;
+ } else {
+ if (enc.isWord(i)) return false;
+ }
}
}
return true;
}
return false;
} else {
+ if (xc.mbuf != null) return false;
for (int i=0; i<BitSet.SINGLE_BYTE_SIZE; i++) {
- if (!enc.isSbWord(i)) {
+ boolean isWord;
+ if (((CTypeNode)y).asciiRange) {
+ isWord = enc.isSbWord(i);
+ } else {
+ isWord = enc.isWord(i);
+ }
+
+ if (!isWord) {
if (!xc.isNot()) {
if (xc.bs.at(i)) return false;
} else {
@@ -849,10 +867,18 @@ final class Analyser extends Parser {
CTypeNode cy = ((CTypeNode)y);
switch (cy.ctype) {
case CharacterType.WORD:
- if (enc.isMbcWord(xs.bytes, xs.p, xs.end)) {
- return cy.not;
+ if (cy.asciiRange) {
+ if (isMbcAsciiWord(enc, xs.bytes, xs.p, xs.end)) {
+ return cy.not;
+ } else {
+ return !cy.not;
+ }
} else {
- return !cy.not;
+ if (enc.isMbcWord(xs.bytes, xs.p, xs.end)) {
+ return cy.not;
+ } else {
+ return !cy.not;
+ }
}
default:
@@ -2062,16 +2088,17 @@ final class Analyser extends Parser {
min = 1;
CTypeNode cn = (CTypeNode)node;
+ int maxCode = cn.asciiRange ? 0x80 : BitSet.SINGLE_BYTE_SIZE;
switch (cn.ctype) {
case CharacterType.WORD:
if (cn.not) {
for (int i=0; i<BitSet.SINGLE_BYTE_SIZE; i++) {
- if (!enc.isWord(i)) {
+ if (!enc.isWord(i) || i >= maxCode) {
opt.map.addChar((byte)i, enc);
}
}
} else {
- for (int i=0; i<BitSet.SINGLE_BYTE_SIZE; i++) {
+ for (int i=0; i<maxCode; i++) {
if (enc.isWord(i)) {
opt.map.addChar((byte)i, enc);
}
diff --git a/src/org/joni/ApplyCaseFold.java b/src/org/joni/ApplyCaseFold.java
index 7dd84ce..496ff71 100644
--- a/src/org/joni/ApplyCaseFold.java
+++ b/src/org/joni/ApplyCaseFold.java
@@ -34,30 +34,44 @@ final class ApplyCaseFold implements ApplyAllCaseFoldFunction {
ScanEnvironment env = arg.env;
Encoding enc = env.enc;
CClassNode cc = arg.cc;
+ CClassNode ascCc = arg.ascCc;
BitSet bs = cc.bs;
+ boolean addFlag;
+
+ if (ascCc == null) {
+ addFlag = false;
+ } else if (Encoding.isAscii(from) == Encoding.isAscii(to[0])) {
+ addFlag = true;
+ } else {
+ addFlag = ascCc.isCodeInCC(enc, from);
+ if (ascCc.isNot()) addFlag = !addFlag;
+ }
if (length == 1) {
boolean inCC = cc.isCodeInCC(enc, from);
-
if (Config.CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS) {
if ((inCC && !cc.isNot()) || (!inCC && cc.isNot())) {
- if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE) {
- cc.addCodeRange(env, to[0], to[0]);
- } else {
- /* /(?i:[^A-C])/.match("a") ==> fail. */
- bs.set(to[0]);
+ if (addFlag) {
+ if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE) {
+ cc.addCodeRange(env, to[0], to[0]);
+ } else {
+ /* /(?i:[^A-C])/.match("a") ==> fail. */
+ bs.set(to[0]);
+ }
}
}
} else {
if (inCC) {
- if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE) {
- if (cc.isNot()) cc.clearNotFlag(enc);
- cc.addCodeRange(env, to[0], to[0]);
- } else {
- if (cc.isNot()) {
- bs.clear(to[0]);
+ if (addFlag) {
+ if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE) {
+ if (cc.isNot()) cc.clearNotFlag(enc);
+ cc.addCodeRange(env, to[0], to[0]);
} else {
- bs.set(to[0]);
+ if (cc.isNot()) {
+ bs.clear(to[0]);
+ } else {
+ bs.set(to[0]);
+ }
}
}
}
diff --git a/src/org/joni/ApplyCaseFoldArg.java b/src/org/joni/ApplyCaseFoldArg.java
index 10b297f..ec7cbaa 100644
--- a/src/org/joni/ApplyCaseFoldArg.java
+++ b/src/org/joni/ApplyCaseFoldArg.java
@@ -24,12 +24,13 @@ import org.joni.ast.ConsAltNode;
public final class ApplyCaseFoldArg {
final ScanEnvironment env;
- final CClassNode cc;
+ final CClassNode cc, ascCc;
ConsAltNode altRoot;
ConsAltNode tail;
- public ApplyCaseFoldArg(ScanEnvironment env, CClassNode cc) {
+ public ApplyCaseFoldArg(ScanEnvironment env, CClassNode cc, CClassNode ascCc) {
this.env = env;
this.cc = cc;
+ this.ascCc = ascCc;
}
}
diff --git a/src/org/joni/CodeRangeBuffer.java b/src/org/joni/CodeRangeBuffer.java
index 137772a..5b4edea 100644
--- a/src/org/joni/CodeRangeBuffer.java
+++ b/src/org/joni/CodeRangeBuffer.java
@@ -25,7 +25,7 @@ import org.joni.exception.ValueException;
public final class CodeRangeBuffer {
private static final int INIT_MULTI_BYTE_RANGE_SIZE = 5;
- private static final int ALL_MULTI_BYTE_RANGE = 0x7fffffff;
+ public static final int ALL_MULTI_BYTE_RANGE = 0x7fffffff;
int[]p;
int used;
diff --git a/src/org/joni/Config.java b/src/org/joni/Config.java
index 6802817..42b007b 100644
--- a/src/org/joni/Config.java
+++ b/src/org/joni/Config.java
@@ -75,7 +75,7 @@ public interface Config extends org.jcodings.Config {
final int CHECK_STRING_THRESHOLD_LEN = 7;
final int CHECK_BUFF_MAX_SIZE = 0x4000;
- final boolean NON_UNICODE_SDW = true;
+ final boolean NON_UNICODE_SDW = false;
final PrintStream log = System.out;
diff --git a/src/org/joni/Lexer.java b/src/org/joni/Lexer.java
index 886f660..24b5a8d 100644
--- a/src/org/joni/Lexer.java
+++ b/src/org/joni/Lexer.java
@@ -21,6 +21,7 @@ package org.joni;
import static org.joni.Option.isAsciiRange;
import static org.joni.Option.isSingleline;
+import static org.joni.Option.isWordBoundAllRange;
import static org.joni.ast.QuantifierNode.isRepeatInfinite;
import org.jcodings.Ptr;
@@ -672,22 +673,22 @@ class Lexer extends ScannerSupport {
switch (c) {
case 'w':
- fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
+ fetchTokenInCCFor_charType(false, CharacterType.WORD);
break;
case 'W':
- fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
+ fetchTokenInCCFor_charType(true, CharacterType.WORD);
break;
case 'd':
- fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
+ fetchTokenInCCFor_charType(false, CharacterType.DIGIT);
break;
case 'D':
- fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
+ fetchTokenInCCFor_charType(true, CharacterType.DIGIT);
break;
case 's':
- fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
+ fetchTokenInCCFor_charType(false, CharacterType.SPACE);
break;
case 'S':
- fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
+ fetchTokenInCCFor_charType(true, CharacterType.SPACE);
break;
case 'h':
if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
@@ -1058,21 +1059,21 @@ class Lexer extends ScannerSupport {
if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE;
break;
case 'w':
- if (syntax.opEscWWord()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
+ if (syntax.opEscWWord()) fetchTokenInCCFor_charType(false, CharacterType.WORD);
break;
case 'W':
- if (syntax.opEscWWord()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
+ if (syntax.opEscWWord()) fetchTokenInCCFor_charType(true, CharacterType.WORD);
break;
case 'b':
if (syntax.opEscBWordBound()) {
fetchTokenFor_anchor(AnchorType.WORD_BOUND);
- token.setAnchorASCIIRange(isAsciiRange(env.option));
+ token.setAnchorASCIIRange(isAsciiRange(env.option) && !isWordBoundAllRange(env.option));
}
break;
case 'B':
if (syntax.opEscBWordBound()) {
fetchTokenFor_anchor(AnchorType.NOT_WORD_BOUND);
- token.setAnchorASCIIRange(isAsciiRange(env.option));
+ token.setAnchorASCIIRange(isAsciiRange(env.option) && !isWordBoundAllRange(env.option));
}
break;
case '<':
@@ -1088,16 +1089,16 @@ class Lexer extends ScannerSupport {
}
break;
case 's':
- if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
+ if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(false, CharacterType.SPACE);
break;
case 'S':
- if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
+ if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(true, CharacterType.SPACE);
break;
case 'd':
- if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
+ if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(false, CharacterType.DIGIT);
break;
case 'D':
- if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
+ if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(true, CharacterType.DIGIT);
break;
case 'h':
if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
diff --git a/src/org/joni/Option.java b/src/org/joni/Option.java
index ce086dd..93c7d6a 100644
--- a/src/org/joni/Option.java
+++ b/src/org/joni/Option.java
@@ -126,6 +126,18 @@ public class Option {
return (option & ASCII_RANGE) != 0;
}
+ public static boolean isPosixBracketAllRange(int option) {
+ return (option & POSIX_BRACKET_ALL_RANGE) != 0;
+ }
+
+ public static boolean isWordBoundAllRange(int option) {
+ return (option & WORD_BOUND_ALL_RANGE) != 0;
+ }
+
+ public static boolean isNewlineCRLF(int option) {
+ return (option & NEWLINE_CRLF) != 0;
+ }
+
/* OP_SET_OPTION is required for these options. ??? */
// public static boolean isDynamic(int option) {
// return (option & (MULTILINE | IGNORECASE)) != 0;
diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java
index 47c3aa0..dfd21e2 100644
--- a/src/org/joni/Parser.java
+++ b/src/org/joni/Parser.java
@@ -21,10 +21,13 @@ package org.joni;
import static org.joni.BitStatus.bsOnAtSimple;
import static org.joni.BitStatus.bsOnOff;
+import static org.joni.Option.isAsciiRange;
import static org.joni.Option.isDontCaptureGroup;
import static org.joni.Option.isIgnoreCase;
+import static org.joni.Option.isPosixBracketAllRange;
import org.jcodings.Encoding;
+import org.jcodings.ObjPtr;
import org.jcodings.Ptr;
import org.jcodings.constants.CharacterType;
import org.jcodings.constants.PosixBracket;
@@ -70,7 +73,7 @@ class Parser extends Lexer {
private static final int POSIX_BRACKET_NAME_MIN_LEN = 4;
private static final int POSIX_BRACKET_CHECK_LIMIT_LENGTH = 20;
private static final byte BRACKET_END[] = ":]".getBytes();
- private boolean parsePosixBracket(CClassNode cc) {
+ private boolean parsePosixBracket(CClassNode cc, CClassNode ascCc) {
mark();
boolean not;
@@ -81,16 +84,23 @@ class Parser extends Lexer {
not = false;
}
if (enc.strLength(bytes, p, stop) >= POSIX_BRACKET_NAME_MIN_LEN + 3) { // else goto not_posix_bracket
- byte[][] pbs= PosixBracket.PBSNamesLower;
- for (int i=0; i<pbs.length; i++) {
- byte[]name = pbs[i];
+ boolean asciiRange = isAsciiRange(env.option) && !isPosixBracketAllRange(env.option);
+
+ for (int i=0; i<PosixBracket.PBSNamesLower.length; i++) {
+ byte[]name = PosixBracket.PBSNamesLower[i];
// hash lookup here ?
if (enc.strNCmp(bytes, p, stop, name, 0, name.length) == 0) {
p = enc.step(bytes, p, stop, name.length);
if (enc.strNCmp(bytes, p, stop, BRACKET_END, 0, BRACKET_END.length) != 0) {
newSyntaxException(ERR_INVALID_POSIX_BRACKET_TYPE);
}
- cc.addCType(PosixBracket.PBSValues[i], not, env, this);
+ int ctype = PosixBracket.PBSValues[i];
+ cc.addCType(ctype, not, asciiRange, env, this);
+ if (ascCc != null) {
+ if (ctype != CharacterType.WORD && ctype != CharacterType.ASCII && !asciiRange) {
+ ascCc.addCType(ctype, not, asciiRange, env, this);
+ }
+ }
inc();
inc();
return false;
@@ -139,10 +149,12 @@ class Parser extends Lexer {
return false;
}
- private CClassNode parseCharClass() {
- fetchTokenInCC();
-
+ private CClassNode parseCharClass(ObjPtr<CClassNode> ascNode) {
final boolean neg;
+ CClassNode cc, prevCc = null, ascCc = null, ascPrevCc = null, workCc = null, ascWorkCc = null;
+ CCStateArg arg = new CCStateArg();
+
+ fetchTokenInCC();
if (token.type == TokenType.CHAR && token.getC() == '^' && !token.escaped) {
neg = true;
fetchTokenInCC();
@@ -150,26 +162,21 @@ class Parser extends Lexer {
neg = false;
}
- if (token.type == TokenType.CC_CLOSE && !syntax.op2OptionECMAScript()) {
+ if (token.type == TokenType.CC_CLOSE) {
if (!codeExistCheck(']', true)) newSyntaxException(ERR_EMPTY_CHAR_CLASS);
env.ccEscWarn("]");
token.type = TokenType.CHAR; /* allow []...] */
}
- CClassNode cc = new CClassNode();
- CClassNode prevCC = null;
- CClassNode workCC = null;
-
- CCStateArg arg = new CCStateArg();
+ cc = new CClassNode();
+ if (isIgnoreCase(env.option)) ascNode.p = new CClassNode();
boolean andStart = false;
arg.state = CCSTATE.START;
-
while (token.type != TokenType.CC_CLOSE) {
boolean fetched = false;
switch (token.type) {
-
case CHAR:
final int len;
if (token.getCode() >= BitSet.SINGLE_BYTE_SIZE || (len = enc.codeToMbcLength(token.getC())) > 1) {
@@ -177,9 +184,9 @@ class Parser extends Lexer {
} else {
arg.inType = CCVALTYPE.SB; // sb_char:
}
- arg.v = token.getC();
- arg.vIsRaw = false;
- parseCharClassValEntry2(cc, arg); // goto val_entry2
+ arg.to = token.getC();
+ arg.toIsRaw = false;
+ parseCharClassValEntry2(cc, ascCc, arg); // goto val_entry2
break;
case RAW_BYTE:
@@ -208,47 +215,57 @@ class Parser extends Lexer {
fetched = false;
}
if (i == 1) {
- arg.v = buf[0] & 0xff;
+ arg.to = buf[0] & 0xff;
arg.inType = CCVALTYPE.SB; // goto raw_single
} else {
- arg.v = enc.mbcToCode(buf, 0, buf.length);
+ arg.to = enc.mbcToCode(buf, 0, buf.length);
arg.inType = CCVALTYPE.CODE_POINT;
}
} else {
- arg.v = token.getC();
+ arg.to = token.getC();
arg.inType = CCVALTYPE.SB; // raw_single:
}
- arg.vIsRaw = true;
- parseCharClassValEntry2(cc, arg); // goto val_entry2
+ arg.toIsRaw = true;
+ parseCharClassValEntry2(cc, ascCc, arg); // goto val_entry2
break;
case CODE_POINT:
- arg.v = token.getCode();
- arg.vIsRaw = true;
- parseCharClassValEntry(cc, arg); // val_entry:, val_entry2
+ arg.to = token.getCode();
+ arg.toIsRaw = true;
+ parseCharClassValEntry(cc, ascCc, arg); // val_entry:, val_entry2
break;
case POSIX_BRACKET_OPEN:
- if (parsePosixBracket(cc)) { /* true: is not POSIX bracket */
+ if (parsePosixBracket(cc, ascCc)) { /* true: is not POSIX bracket */
env.ccEscWarn("[");
p = token.backP;
- arg.v = token.getC();
- arg.vIsRaw = false;
- parseCharClassValEntry(cc, arg); // goto val_entry
+ arg.to = token.getC();
+ arg.toIsRaw = false;
+ parseCharClassValEntry(cc, ascCc, arg); // goto val_entry
break;
}
- cc.nextStateClass(arg, env); // goto next_class
+ cc.nextStateClass(arg, ascCc, env); // goto next_class
break;
case CHAR_TYPE:
- cc.addCType(token.getPropCType(), token.getPropNot(), env, this);
- cc.nextStateClass(arg, env); // next_class:
+ cc.addCType(token.getPropCType(), token.getPropNot(), isAsciiRange(env.option), env, this);
+ if (ascCc != null) {
+ if (token.getPropCType() != CharacterType.WORD) {
+ ascCc.addCType(token.getPropCType(), token.getPropNot(), isAsciiRange(env.option), env, this);
+ }
+ }
+ cc.nextStateClass(arg, ascCc, env); // next_class:
break;
case CHAR_PROPERTY:
int ctype = fetchCharPropertyToCType();
- cc.addCType(ctype, token.getPropNot(), env, this);
- cc.nextStateClass(arg, env); // goto next_class
+ cc.addCType(ctype, token.getPropNot(), false, env, this);
+ if (ascCc != null) {
+ if (ctype != CharacterType.ASCII) {
+ ascCc.addCType(ctype, token.getPropNot(), false, env, this);
+ }
+ }
+ cc.nextStateClass(arg, ascCc, env); // goto next_class
break;
case CC_RANGE:
@@ -256,41 +273,43 @@ class Parser extends Lexer {
fetchTokenInCC();
fetched = true;
if (token.type == TokenType.CC_CLOSE) { /* allow [x-] */
- parseCharClassRangeEndVal(cc, arg); // range_end_val:, goto val_entry;
+ parseCharClassRangeEndVal(cc, ascCc, arg); // range_end_val:, goto val_entry;
break;
} else if (token.type == TokenType.CC_AND) {
env.ccEscWarn("-");
- parseCharClassRangeEndVal(cc, arg); // goto range_end_val
+ parseCharClassRangeEndVal(cc, ascCc, arg); // goto range_end_val
break;
}
+ if (arg.type == CCVALTYPE.CLASS) newValueException(ERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS);
arg.state = CCSTATE.RANGE;
} else if (arg.state == CCSTATE.START) {
- arg.v = token.getC(); /* [-xa] is allowed */
- arg.vIsRaw = false;
+ arg.to = token.getC(); /* [-xa] is allowed */
+ arg.toIsRaw = false;
fetchTokenInCC();
fetched = true;
if (token.type == TokenType.CC_RANGE || andStart) env.ccEscWarn("-"); /* [--x] or [a&&-x] is warned. */
- parseCharClassValEntry(cc, arg); // goto val_entry
+ parseCharClassValEntry(cc, ascCc, arg); // goto val_entry
break;
} else if (arg.state == CCSTATE.RANGE) {
env.ccEscWarn("-");
- parseCharClassSbChar(cc, arg); // goto sb_char /* [!--x] is allowed */
+ parseCharClassSbChar(cc, ascCc, arg); // goto sb_char /* [!--x] is allowed */
break;
} else { /* CCS_COMPLETE */
fetchTokenInCC();
fetched = true;
if (token.type == TokenType.CC_CLOSE) { /* allow [a-b-] */
- parseCharClassRangeEndVal(cc, arg); // goto range_end_val
+ parseCharClassRangeEndVal(cc, ascCc, arg); // goto range_end_val
break;
} else if (token.type == TokenType.CC_AND) {
env.ccEscWarn("-");
- parseCharClassRangeEndVal(cc, arg); // goto range_end_val
+ parseCharClassRangeEndVal(cc, ascCc, arg); // goto range_end_val
break;
}
if (syntax.allowDoubleRangeOpInCC()) {
env.ccEscWarn("-");
- parseCharClassSbChar(cc, arg); // goto sb_char /* [0-9-a] is allowed as [0-9\-a] */
+ // parseCharClassSbChar(cc, ascCc, arg); // goto sb_char /* [0-9-a] is allowed as [0-9\-a] */
+ parseCharClassRangeEndVal(cc, ascCc, arg); // goto range_end_val
break;
}
newSyntaxException(ERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS);
@@ -298,27 +317,40 @@ class Parser extends Lexer {
break;
case CC_CC_OPEN: /* [ */
- CClassNode acc = parseCharClass();
+ ObjPtr<CClassNode> ascPtr = new ObjPtr<CClassNode>();
+ CClassNode acc = parseCharClass(ascPtr);
cc.or(acc, enc);
+ if (ascPtr.p != null) {
+ ascCc.or(ascPtr.p, enc);
+ }
break;
case CC_AND: /* && */
if (arg.state == CCSTATE.VALUE) {
- arg.v = 0; // ??? safe v ?
- arg.vIsRaw = false;
- cc.nextStateValue(arg, env);
+ arg.to = 0;
+ arg.toIsRaw = false;
+ cc.nextStateValue(arg, ascCc, env);
}
/* initialize local variables */
andStart = true;
arg.state = CCSTATE.START;
- if (prevCC != null) {
- prevCC.and(cc, enc);
+ if (prevCc != null) {
+ prevCc.and(cc, enc);
+ if (ascCc != null) {
+ ascPrevCc.and(ascCc, enc);
+ }
} else {
- prevCC = cc;
- if (workCC == null) workCC = new CClassNode();
- cc = workCC;
+ prevCc = cc;
+ if (workCc == null) workCc = new CClassNode();
+ cc = workCc;
+ if (ascCc != null) {
+ ascPrevCc = ascCc;
+ if (ascWorkCc == null) ascWorkCc = new CClassNode();
+ ascCc = ascWorkCc;
+ }
}
cc.clear();
+ if (ascCc != null) ascCc.clear();
break;
case EOT:
@@ -333,24 +365,30 @@ class Parser extends Lexer {
} // while
if (arg.state == CCSTATE.VALUE) {
- arg.v = 0; // ??? safe v ?
- arg.vIsRaw = false;
- cc.nextStateValue(arg, env);
+ arg.to = 0;
+ arg.toIsRaw = false;
+ cc.nextStateValue(arg, ascCc, env);
}
- if (prevCC != null) {
- prevCC.and(cc, enc);
- cc = prevCC;
+ if (prevCc != null) {
+ prevCc.and(cc, enc);
+ cc = prevCc;
+ if (ascCc != null) {
+ ascPrevCc.and(ascCc, enc);
+ ascCc = ascPrevCc;
+ }
}
if (neg) {
cc.setNot();
+ if (ascCc != null) ascCc.setNot();
} else {
cc.clearNot();
+ if (ascCc != null) ascCc.clearNot();
}
if (cc.isNot() && syntax.notNewlineInNegativeCC()) {
- if (!cc.isEmpty()) {
+ if (!cc.isEmpty()) { // ???
final int NEW_LINE = 0x0a;
if (enc.isNewLine(NEW_LINE)) {
if (enc.codeToMbcLength(NEW_LINE) == 1) {
@@ -365,27 +403,27 @@ class Parser extends Lexer {
return cc;
}
- private void parseCharClassSbChar(CClassNode cc, CCStateArg arg) {
+ private void parseCharClassSbChar(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
arg.inType = CCVALTYPE.SB;
- arg.v = token.getC();
- arg.vIsRaw = false;
- parseCharClassValEntry2(cc, arg); // goto val_entry2
+ arg.to = token.getC();
+ arg.toIsRaw = false;
+ parseCharClassValEntry2(cc, ascCc, arg); // goto val_entry2
}
- private void parseCharClassRangeEndVal(CClassNode cc, CCStateArg arg) {
- arg.v = '-';
- arg.vIsRaw = false;
- parseCharClassValEntry(cc, arg); // goto val_entry
+ private void parseCharClassRangeEndVal(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
+ arg.to = '-';
+ arg.toIsRaw = false;
+ parseCharClassValEntry(cc, ascCc, arg); // goto val_entry
}
- private void parseCharClassValEntry(CClassNode cc, CCStateArg arg) {
- int len = enc.codeToMbcLength(arg.v);
+ private void parseCharClassValEntry(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
+ int len = enc.codeToMbcLength(arg.to);
arg.inType = len == 1 ? CCVALTYPE.SB : CCVALTYPE.CODE_POINT;
- parseCharClassValEntry2(cc, arg); // val_entry2:
+ parseCharClassValEntry2(cc, ascCc, arg); // val_entry2:
}
- private void parseCharClassValEntry2(CClassNode cc, CCStateArg arg) {
- cc.nextStateValue(arg, env);
+ private void parseCharClassValEntry2(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
+ cc.nextStateValue(arg, ascCc, env);
}
private Node parseEnclose(TokenType term) {
@@ -728,6 +766,8 @@ class Parser extends Lexer {
Node node = null;
boolean group = false;
+ // if (tok->type == (enum TokenSyms )term) goto end_of_token; ???
+
switch(token.type) {
case ALT:
case EOT:
@@ -742,6 +782,7 @@ class Parser extends Lexer {
EncloseNode en = (EncloseNode)node;
env.option = en.option;
fetchToken();
+ // env.option = prev; // ???
Node target = parseSubExp(term);
env.option = prev;
en.setTarget(target);
@@ -811,6 +852,7 @@ class Parser extends Lexer {
case ANCHOR:
node = new AnchorNode(token.getAnchorSubtype());
+ ((AnchorNode)node).asciiRange = token.getAnchorASCIIRange();
break;
case OP_REPEAT:
@@ -926,8 +968,8 @@ class Parser extends Lexer {
int sbOut = enc.minLength() > 1 ? 0x00 : 0x80;
int extend = GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Extend);
CClassNode cc = new CClassNode();
- cc.addCType(extend, false, env, this);
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_SpacingMark), false, env, this);
+ cc.addCType(extend, false, false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_SpacingMark), false, false, env, this);
cc.addCodeRange(env, 0x200D, 0x200D);
QuantifierNode qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
@@ -937,7 +979,7 @@ class Parser extends Lexer {
/* !Control */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Control), true, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Control), true, false, env, this);
if (enc.minLength() > 1) {
CodeRangeBuffer buff = new CodeRangeBuffer();
buff = CodeRangeBuffer.addCodeRange(buff, env, 0x0a, 0x0a);
@@ -959,21 +1001,21 @@ class Parser extends Lexer {
/* T+ */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, false, env, this);
qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
alt = ConsAltNode.newAltNode(qn, alt);
/* L+ */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, false, env, this);
qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
alt = ConsAltNode.newAltNode(qn, alt);
/* L* LVT T* */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
@@ -981,11 +1023,11 @@ class Parser extends Lexer {
list2 = ConsAltNode.newListNode(qn, null);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_LVT), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_LVT), false, false, env, this);
list2 = ConsAltNode.newListNode(cc, list2);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
@@ -994,23 +1036,23 @@ class Parser extends Lexer {
/* L* LV V* T* */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, null);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_V), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_V), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_LV), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_LV), false, false, env, this);
list2 = ConsAltNode.newListNode(cc, list2);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
@@ -1019,19 +1061,19 @@ class Parser extends Lexer {
/* L* V+ T* */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, null);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_V), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_V), false, false, env, this);
qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
@@ -1044,33 +1086,33 @@ class Parser extends Lexer {
/* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, false, env, this);
qn = new QuantifierNode(0, 1, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, null);
cc = new CClassNode();
- cc.addCType(extend, false, env, this);
+ cc.addCType(extend, false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, false, env, this);
list2 = ConsAltNode.newListNode(cc, list2);
ConsAltNode alt2 = ConsAltNode.newAltNode(list2, null);
/* Glue_After_Zwj */
cc = new CClassNode();
- cc.addCType(extend, false, env, this);
+ cc.addCType(extend, false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, null);
cc = new CClassNode();
cc.addCTypeByRange(-1, false, enc, sbOut, GraphemeNames.Glue_After_Zwj_Ranges);
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Glue_After_Zwj), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Glue_After_Zwj), false, false, env, this);
list2 = ConsAltNode.newListNode(cc, list2);
alt2 = ConsAltNode.newAltNode(list2, alt2);
@@ -1106,14 +1148,14 @@ class Parser extends Lexer {
/* E_Modifier? */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, false, env, this);
qn = new QuantifierNode(0, 1, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
/* Extend* */
cc = new CClassNode();
- cc.addCType(extend, false, env, this);
+ cc.addCType(extend, false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
@@ -1121,8 +1163,8 @@ class Parser extends Lexer {
/* (E_Base | EBG) */
cc = new CClassNode();
cc.addCTypeByRange(-1, false, enc, sbOut, GraphemeNames.E_Base_Ranges);
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base), false, env, this);
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base), false, false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, false, env, this);
list2 = ConsAltNode.newListNode(cc, list2);
alt = ConsAltNode.newAltNode(list2, alt);
@@ -1133,14 +1175,14 @@ class Parser extends Lexer {
* http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html
*/
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, false, env, this);
qn = new QuantifierNode(0, 1, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, null);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Glue_After_Zwj), false, env, this);
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Glue_After_Zwj), false, false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, false, env, this);
list2 = ConsAltNode.newListNode(cc, list2);
str = new StringNode();
@@ -1163,7 +1205,7 @@ class Parser extends Lexer {
/* Prepend* */
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Prepend), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Prepend), false, false, env, this);
qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list = ConsAltNode.newListNode(qn, list);
@@ -1185,7 +1227,7 @@ class Parser extends Lexer {
list2 = ConsAltNode.newListNode(qn, null);
cc = new CClassNode();
- cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Prepend), false, env, this);
+ cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Prepend), false, false, env, this);
qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(cc);
list2 = ConsAltNode.newListNode(qn, list2);
@@ -1341,7 +1383,7 @@ class Parser extends Lexer {
private Node parseCodePoint() {
byte[]buf = new byte[Config.ENC_CODE_TO_MBC_MAXLEN];
int num = enc.codeToMbc(token.getCode(), buf, 0);
- // #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG ... // setRaw() #else
+ // #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG ... // setRaw() #else // ???
return new StringNode(buf, 0, num);
}
@@ -1358,55 +1400,56 @@ class Parser extends Lexer {
private Node parseCharType(Node node) {
switch(token.getPropCType()) {
- case CharacterType.D:
- case CharacterType.S:
- case CharacterType.W:
- if (Config.NON_UNICODE_SDW) {
- CClassNode cc = new CClassNode();
- cc.addCType(token.getPropCType(), false, env, this);
- if (token.getPropNot()) cc.setNot();
- node = cc;
- }
- break;
-
case CharacterType.WORD:
- node = new CTypeNode(token.getPropCType(), token.getPropNot(), false);
+ node = new CTypeNode(token.getPropCType(), token.getPropNot(), isAsciiRange(env.option));
break;
case CharacterType.SPACE:
case CharacterType.DIGIT:
case CharacterType.XDIGIT:
CClassNode ccn = new CClassNode();
- ccn.addCType(token.getPropCType(), false, env, this);
+ ccn.addCType(token.getPropCType(), false, isAsciiRange(env.option), env, this);
if (token.getPropNot()) ccn.setNot();
node = ccn;
break;
default:
newInternalException(ERR_PARSER_BUG);
-
} // inner switch
return node;
}
- private CClassNode parseCharProperty() {
+ private Node cClassCaseFold(Node node, CClassNode cc, CClassNode ascCc) {
+ ApplyCaseFoldArg arg = new ApplyCaseFoldArg(env, cc, ascCc);
+ enc.applyAllCaseFold(env.caseFoldFlag, ApplyCaseFold.INSTANCE, arg);
+ if (arg.altRoot != null) {
+ node = ConsAltNode.newAltNode(node, arg.altRoot);
+ }
+ return node;
+ }
+
+ private Node parseCharProperty() {
int ctype = fetchCharPropertyToCType();
- CClassNode n = new CClassNode();
- n.addCType(ctype, false, env, this);
- if (token.getPropNot()) n.setNot();
- return n;
+ CClassNode cc = new CClassNode();
+ Node node = cc;
+ cc.addCType(ctype, false, false, env, this);
+ if (token.getPropNot()) cc.setNot();
+
+ if (isIgnoreCase(env.option)) {
+ if (ctype != CharacterType.ASCII) {
+ node = cClassCaseFold(node, cc, cc);
+ }
+ }
+ return node;
}
private Node parseCcCcOpen() {
- CClassNode cc = parseCharClass();
+ ObjPtr<CClassNode> ascPtr = new ObjPtr<CClassNode>();
+ CClassNode cc = parseCharClass(ascPtr);
Node node = cc;
- if (isIgnoreCase(env.option)) {
- ApplyCaseFoldArg arg = new ApplyCaseFoldArg(env, cc);
- enc.applyAllCaseFold(env.caseFoldFlag, ApplyCaseFold.INSTANCE, arg);
- if (arg.altRoot != null) {
- node = ConsAltNode.newAltNode(node, arg.altRoot);
- }
+ if (isIgnoreCase(env.option)) {
+ node = cClassCaseFold(node, cc, ascPtr.p);
}
return node;
}
diff --git a/src/org/joni/ScanEnvironment.java b/src/org/joni/ScanEnvironment.java
index 4c68a04..9731881 100644
--- a/src/org/joni/ScanEnvironment.java
+++ b/src/org/joni/ScanEnvironment.java
@@ -27,7 +27,7 @@ import org.joni.exception.ErrorMessages;
import org.joni.exception.InternalException;
public final class ScanEnvironment {
- int option;
+ public int option;
final int caseFoldFlag;
final public Encoding enc;
final public Syntax syntax;
diff --git a/src/org/joni/ast/CClassNode.java b/src/org/joni/ast/CClassNode.java
index 5bfe05a..3ee8fa4 100644
--- a/src/org/joni/ast/CClassNode.java
+++ b/src/org/joni/ast/CClassNode.java
@@ -256,41 +256,33 @@ public final class CClassNode extends Node {
}
}
- public void addCType(int ctype, boolean not, ScanEnvironment env, IntHolder sbOut) {
+ // add_ctype_to_cc
+ public void addCType(int ctype, boolean not, boolean asciiRange, ScanEnvironment env, IntHolder sbOut) {
Encoding enc = env.enc;
-
- if (Config.NON_UNICODE_SDW) {
- switch(ctype) {
- case CharacterType.S:
- if (!not && env.syntax.op2OptionECMAScript()) {
- // treat \u2028 and \u2029 as whitespace
- addCodeRange(env, 8232, 8233);
- }
- case CharacterType.D:
- case CharacterType.W:
- ctype ^= CharacterType.SPECIAL_MASK;
+ int[]ranges = enc.ctypeCodeRange(ctype, sbOut);
+ if (ranges != null) {
+ if (asciiRange) {
+ CClassNode ccWork = new CClassNode();
+ ccWork.addCTypeByRange(ctype, not, enc, sbOut.value, ranges);
if (not) {
- for (int c = 0; c < BitSet.SINGLE_BYTE_SIZE; c++) {
- if (!ASCIIEncoding.INSTANCE.isCodeCType(c, ctype)) bs.set(c);
- //if ((AsciiTables.AsciiCtypeTable[c] & (1 << ctype)) == 0) bs.set(c);
- }
- addAllMultiByteRange(enc);
+ ccWork.addCodeRangeToBuf(0x80, CodeRangeBuffer.ALL_MULTI_BYTE_RANGE); // add_code_range_to_buf0
} else {
- for (int c = 0; c < BitSet.SINGLE_BYTE_SIZE; c++) {
- if (ASCIIEncoding.INSTANCE.isCodeCType(c, ctype)) bs.set(c);
- //if ((AsciiTables.AsciiCtypeTable[c] & (1 << ctype)) != 0) bs.set(c);
+ CClassNode ccAscii = new CClassNode();
+ if (enc.minLength() > 1) {
+ ccAscii.addCodeRangeToBuf(0x00, 0x7F);
+ } else {
+ ccAscii.bs.setRange(0x00, 0x7F);
}
+ ccWork.and(ccAscii, enc);
}
- return;
+ or(ccWork, enc);
+ } else {
+ addCTypeByRange(ctype, not, enc, sbOut.value, ranges);
}
- }
-
- int[]ranges = enc.ctypeCodeRange(ctype, sbOut);
- if (ranges != null) {
- addCTypeByRange(ctype, not, enc, sbOut.value, ranges);
return;
}
+ int maxCode = asciiRange ? 0x80 : BitSet.SINGLE_BYTE_SIZE;
switch(ctype) {
case CharacterType.ALPHA:
case CharacterType.BLANK:
@@ -319,30 +311,29 @@ public final class CClassNode extends Node {
case CharacterType.PRINT:
if (not) {
for (int c=0; c<BitSet.SINGLE_BYTE_SIZE; c++) {
- if (!enc.isCodeCType(c, ctype)) bs.set(c);
+ if (!enc.isCodeCType(c, ctype) || c >= maxCode) bs.set(c);
}
+ if (asciiRange) addAllMultiByteRange(enc);
} else {
- for (int c=0; c<BitSet.SINGLE_BYTE_SIZE; c++) {
+ for (int c=0; c<maxCode; c++) {
if (enc.isCodeCType(c, ctype)) bs.set(c);
}
- addAllMultiByteRange(enc);
+ if (!asciiRange) addAllMultiByteRange(enc);
}
break;
case CharacterType.WORD:
if (!not) {
- for (int c=0; c<BitSet.SINGLE_BYTE_SIZE; c++) {
+ for (int c=0; c<maxCode; c++) {
if (enc.isSbWord(c)) bs.set(c);
}
-
- addAllMultiByteRange(enc);
+ if (!asciiRange) addAllMultiByteRange(enc);
} else {
for (int c=0; c<BitSet.SINGLE_BYTE_SIZE; c++) {
- try {
- if (enc.codeToMbcLength(c) > 0 && /* check invalid code point */
- !enc.isWord(c)) bs.set(c);
- } catch (EncodingException ve) {};
+ if (enc.codeToMbcLength(c) > 0 && /* check invalid code point */
+ !(enc.isWord(c) || c >= maxCode)) bs.set(c);
}
+ if (asciiRange) addAllMultiByteRange(enc);
}
break;
@@ -365,47 +356,49 @@ public final class CClassNode extends Node {
}
public static final class CCStateArg {
- public int v;
- public int vs;
- public boolean vsIsRaw;
- public boolean vIsRaw;
+ public int from;
+ public int to;
+ public boolean fromIsRaw;
+ public boolean toIsRaw;
public CCVALTYPE inType;
public CCVALTYPE type;
public CCSTATE state;
}
- public void nextStateClass(CCStateArg arg, ScanEnvironment env) {
+ public void nextStateClass(CCStateArg arg, CClassNode ascCC, ScanEnvironment env) {
if (arg.state == CCSTATE.RANGE) throw new SyntaxException(ErrorMessages.ERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE);
if (arg.state == CCSTATE.VALUE && arg.type != CCVALTYPE.CLASS) {
if (arg.type == CCVALTYPE.SB) {
- bs.set(arg.vs);
+ bs.set(arg.from);
+ if (ascCC != null) ascCC.bs.set(arg.from);
} else if (arg.type == CCVALTYPE.CODE_POINT) {
- addCodeRange(env, arg.vs, arg.vs);
+ addCodeRange(env, arg.from, arg.from);
+ if (ascCC != null) ascCC.addCodeRange(env, arg.from, arg.from); // add_code_range0
}
}
arg.state = CCSTATE.VALUE;
arg.type = CCVALTYPE.CLASS;
}
- public void nextStateValue(CCStateArg arg, ScanEnvironment env) {
-
+ public void nextStateValue(CCStateArg arg, CClassNode ascCc, ScanEnvironment env) {
switch(arg.state) {
case VALUE:
if (arg.type == CCVALTYPE.SB) {
- if (arg.vs > 0xff) throw new ValueException(ErrorMessages.ERR_INVALID_CODE_POINT_VALUE);
- bs.set(arg.vs);
+ bs.set(arg.from);
+ if (ascCc != null) ascCc.bs.set(arg.from);
} else if (arg.type == CCVALTYPE.CODE_POINT) {
- addCodeRange(env, arg.vs, arg.vs);
+ addCodeRange(env, arg.from, arg.from);
+ if (ascCc != null) ascCc.addCodeRange(env, arg.from, arg.from); // add_code_range0
}
break;
case RANGE:
if (arg.inType == arg.type) {
if (arg.inType == CCVALTYPE.SB) {
- if (arg.vs > 0xff || arg.v > 0xff) throw new ValueException(ErrorMessages.ERR_INVALID_CODE_POINT_VALUE);
+ if (arg.from > 0xff || arg.to > 0xff) throw new ValueException(ErrorMessages.ERR_INVALID_CODE_POINT_VALUE);
- if (arg.vs > arg.v) {
+ if (arg.from > arg.to) {
if (env.syntax.allowEmptyRangeInCC()) {
// goto ccs_range_end
arg.state = CCSTATE.COMPLETE;
@@ -414,12 +407,14 @@ public final class CClassNode extends Node {
throw new ValueException(ErrorMessages.ERR_EMPTY_RANGE_IN_CHAR_CLASS);
}
}
- bs.setRange(arg.vs, arg.v);
+ bs.setRange(arg.from, arg.to);
+ if (ascCc != null) ascCc.bs.setRange(arg.from, arg.to);
} else {
- addCodeRange(env, arg.vs, arg.v);
+ addCodeRange(env, arg.from, arg.to);
+ if (ascCc != null) ascCc.addCodeRange(env, arg.from, arg.to); // add_code_range0
}
} else {
- if (arg.vs > arg.v) {
+ if (arg.from > arg.to) {
if (env.syntax.allowEmptyRangeInCC()) {
// goto ccs_range_end
arg.state = CCSTATE.COMPLETE;
@@ -428,8 +423,12 @@ public final class CClassNode extends Node {
throw new ValueException(ErrorMessages.ERR_EMPTY_RANGE_IN_CHAR_CLASS);
}
}
- bs.setRange(arg.vs, arg.v < 0xff ? arg.v : 0xff);
- addCodeRange(env, arg.vs, arg.v);
+ bs.setRange(arg.from, arg.to < 0xff ? arg.to : 0xff);
+ addCodeRange(env, arg.from, arg.to);
+ if (ascCc != null) {
+ ascCc.bs.setRange(arg.from, arg.to < 0xff ? arg.to : 0xff);
+ ascCc.addCodeRange(env, arg.from, arg.to); // add_code_range0
+ }
}
// ccs_range_end:
arg.state = CCSTATE.COMPLETE;
@@ -445,8 +444,8 @@ public final class CClassNode extends Node {
} // switch
- arg.vsIsRaw = arg.vIsRaw;
- arg.vs = arg.v;
+ arg.fromIsRaw = arg.toIsRaw;
+ arg.from = arg.to;
arg.type = arg.inType;
}
@@ -497,7 +496,7 @@ public final class CClassNode extends Node {
private static int CR_FROM(int[] range, int i) {
return range[(i * 2) + 1];
}
-
+
private static int CR_TO(int[] range, int i) {
return range[(i * 2) + 2];
}
diff --git a/test/org/joni/test/TestU8.java b/test/org/joni/test/TestU8.java
index 04ba715..319b22c 100755
--- a/test/org/joni/test/TestU8.java
+++ b/test/org/joni/test/TestU8.java
@@ -203,26 +203,10 @@ public class TestU8 extends Test {
x2s("(?i)АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", "абвгдеёжзийклмнопрстуфхцчшщъыьэюя", 0, 33 * 2);
x2s("(?i)АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", 0, 33 * 2);
- ns("(?ia)\\w+", "\u212a\u017f");
- // ns("(?ia)[\\w]+", "\u212a\u017f");
- ns("(?ia)[^\\W]+", "\u212a\u017f");
- // x2s("(?ia)[^\\W]+", "ks", 0, 2);
- ns("(?iu)\\p{ASCII}", "\u212a");
- ns("(?iu)\\P{ASCII}", "s");
- // ns("(?iu)[\\p{ASCII}]", "\u212a");
- // ns("(?iu)[\\P{ASCII}]", "s");
- ns("(?ia)\\p{ASCII}", "\u212a");
- ns("(?ia)\\P{ASCII}", "s");
- // ns("(?ia)[\\p{ASCII}]", "\u212a");
- // ns("(?ia)[\\P{ASCII}]", "s");
- x2s("(?iu)[s]+", "Ss\u017f ", 0, 4);
- x2s("(?ia)[s]+", "Ss\u017f ", 0, 4);
- x2s("(?iu)[^s]+", "Ss\u017f ", 4, 5);
- x2s("(?ia)[^s]+", "Ss\u017f ", 4, 5);
- x2s("(?iu)[[:lower:]]", "\u017f", 0, 2);
- // ns("(?ia)[[:lower:]]", "\u017f");
- x2s("(?u)[[:upper:]]", "\u212a", 0, 3);
- // ns("(?a)[[:upper:]]", "\u212a");
+ x2s("(?iu)\\p{lower}\\p{upper}", "Ab", 0, 2);
+ x2s("(?ia)\\p{lower}\\p{upper}", "Ab", 0, 2);
+ x2s("(?iu)[[:lower:]][[:upper:]]", "Ab", 0, 2);
+ x2s("(?ia)[[:lower:]][[:upper:]]", "Ab", 0, 2);
super.test();
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git
More information about the pkg-java-commits mailing list