[jruby-joni] 90/279: Bump dependency version for jcodings and support nonunicode \s \d \w
Hideki Yamane
henrich at moszumanska.debian.org
Mon Nov 16 11:26:57 UTC 2015
This is an automated email from the git hooks/post-receive script.
henrich pushed a commit to branch debian/sid
in repository jruby-joni.
commit 7d7a5189d47bed7b5ec47e9f01b428706eceb249
Author: Marcin Mielzynski <lopx at gazeta.pl>
Date: Mon Feb 13 01:36:57 2012 +0100
Bump dependency version for jcodings and support nonunicode \s \d \w
---
pom.xml | 2 +-
src/org/joni/Config.java | 56 +++----
src/org/joni/Lexer.java | 336 +++++++++++++++++++--------------------
src/org/joni/Parser.java | 319 ++++++++++++++++++++-----------------
src/org/joni/ast/CClassNode.java | 25 ++-
5 files changed, 392 insertions(+), 346 deletions(-)
diff --git a/pom.xml b/pom.xml
index dff5b07..a8a3eba 100644
--- a/pom.xml
+++ b/pom.xml
@@ -75,7 +75,7 @@
<dependency>
<groupId>org.jruby.jcodings</groupId>
<artifactId>jcodings</artifactId>
- <version>1.0.4</version>
+ <version>1.0.6</version>
</dependency>
<dependency>
<groupId>junit</groupId>
diff --git a/src/org/joni/Config.java b/src/org/joni/Config.java
index 07762f0..f1f4947 100644
--- a/src/org/joni/Config.java
+++ b/src/org/joni/Config.java
@@ -1,20 +1,20 @@
/*
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
- *
+ *
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.joni;
@@ -23,25 +23,25 @@ import java.io.PrintStream;
public interface Config extends org.jcodings.Config {
final int CHAR_TABLE_SIZE = 256;
-
+
final boolean USE_NAMED_GROUP = true;
final boolean USE_SUBEXP_CALL = true;
final boolean USE_BACKREF_WITH_LEVEL = true; /* \k<name+n>, \k<name-n> */
-
+
final boolean USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT = true; /* /(?:()|())*\2/ */
final boolean USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE = true; /* /\n$/ =~ "\n" */
final boolean USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR = false;
final boolean CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS = true;
-
+
final boolean USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE = false;
final boolean USE_CAPTURE_HISTORY = false;
final boolean USE_VARIABLE_META_CHARS = true;
final boolean USE_WORD_BEGIN_END = true; /* "\<": word-begin, "\>": word-end */
- final boolean USE_POSIX_API_REGION_OPTION = true; /* needed for POSIX API support */
+ final boolean USE_POSIX_API_REGION_OPTION = true; /* needed for POSIX API support */
final boolean USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE = true;
final boolean USE_COMBINATION_EXPLOSION_CHECK = false;
-
+
final int NREGION = 10;
final int MAX_BACKREF_NUM = 1000;
final int MAX_REPEAT_NUM = 100000;
@@ -53,34 +53,36 @@ public interface Config extends org.jcodings.Config {
// internal config
final boolean USE_PARSE_TREE_NODE_RECYCLE = true;
final boolean USE_OP_PUSH_OR_JUMP_EXACT = true;
- final boolean USE_SHARED_CCLASS_TABLE = false;
- final boolean USE_QTFR_PEEK_NEXT = true;
+ final boolean USE_SHARED_CCLASS_TABLE = false;
+ final boolean USE_QTFR_PEEK_NEXT = true;
final int INIT_MATCH_STACK_SIZE = 64;
final int DEFAULT_MATCH_STACK_LIMIT_SIZE = 0; /* unlimited */
final int NUMBER_OF_POOLED_STACKS = 4;
-
-
+
+
final boolean DONT_OPTIMIZE = false;
-
-
+
+
final int MAX_CAPTURE_HISTORY_GROUP = 31;
-
+
final int CHECK_STRING_THRESHOLD_LEN = 7;
final int CHECK_BUFF_MAX_SIZE = 0x4000;
-
-
+
+ final boolean NON_UNICODE_SDW = false;
+
+
final PrintStream log = System.out;
final PrintStream err = System.err;
final boolean DEBUG_ALL = false;
- final boolean DEBUG = DEBUG_ALL;
+ final boolean DEBUG = DEBUG_ALL;
final boolean DEBUG_PARSE_TREE = DEBUG_ALL;
final boolean DEBUG_COMPILE = DEBUG_ALL;
final boolean DEBUG_COMPILE_BYTE_CODE_INFO = DEBUG_ALL;
- final boolean DEBUG_SEARCH = DEBUG_ALL;
+ final boolean DEBUG_SEARCH = DEBUG_ALL;
final boolean DEBUG_MATCH = DEBUG_ALL;
final boolean DEBUG_ASM = true;
final boolean DEBUG_ASM_EXEC = true;
diff --git a/src/org/joni/Lexer.java b/src/org/joni/Lexer.java
index 172132f..9094757 100644
--- a/src/org/joni/Lexer.java
+++ b/src/org/joni/Lexer.java
@@ -1,20 +1,20 @@
/*
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
- *
+ *
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.joni;
@@ -31,7 +31,7 @@ import org.joni.constants.TokenType;
import org.joni.exception.ErrorMessages;
class Lexer extends ScannerSupport {
- protected final ScanEnvironment env;
+ protected final ScanEnvironment env;
protected final Syntax syntax; // fast access to syntax
protected final Token token = new Token(); // current token
@@ -40,17 +40,17 @@ class Lexer extends ScannerSupport {
this.env = env;
this.syntax = env.syntax;
}
-
+
/**
* @return 0: normal {n,m}, 2: fixed {n}
- * !introduce returnCode here
+ * !introduce returnCode here
*/
private int fetchRangeQuantifier() {
mark();
boolean synAllow = syntax.allowInvalidInterval();
-
+
if (!left()) {
- if (synAllow) {
+ if (synAllow) {
return 1; /* "....{" : OK! */
} else {
newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE);
@@ -63,7 +63,7 @@ class Lexer extends ScannerSupport {
newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE);
}
}
-
+
int low = scanUnsignedNumber();
if (low < 0) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
if (low > Config.MAX_REPEAT_NUM) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
@@ -77,18 +77,18 @@ class Lexer extends ScannerSupport {
return invalidRangeQuantifier(synAllow);
}
}
-
+
if (!left()) return invalidRangeQuantifier(synAllow);
-
+
fetch();
int up;
int ret = 0;
if (c == ',') {
- int prev = p; // ??? last
+ int prev = p; // ??? last
up = scanUnsignedNumber();
if (up < 0) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
if (up > Config.MAX_REPEAT_NUM) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
-
+
if (p == prev) {
if (nonLow) return invalidRangeQuantifier(synAllow);
up = QuantifierNode.REPEAT_INFINITE; /* {n,} : {n,infinite} */
@@ -99,28 +99,28 @@ class Lexer extends ScannerSupport {
up = low; /* {n} : exact n times */
ret = 2; /* fixed */
}
-
+
if (!left()) return invalidRangeQuantifier(synAllow);
fetch();
-
+
if (syntax.opEscBraceInterval()) {
if (c != syntax.metaCharTable.esc) return invalidRangeQuantifier(synAllow);
fetch();
}
-
+
if (c != '}') return invalidRangeQuantifier(synAllow);
-
+
if (!isRepeatInfinite(up) && low > up) {
newValueException(ERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE);
}
-
+
token.type = TokenType.INTERVAL;
token.setRepeatLower(low);
token.setRepeatUpper(up);
-
+
return ret; /* 0: normal {n,m}, 2: fixed {n} */
}
-
+
private int invalidRangeQuantifier(boolean synAllow) {
if (synAllow) {
restore();
@@ -130,7 +130,7 @@ class Lexer extends ScannerSupport {
return 0; // not reached
}
}
-
+
/* \M-, \C-, \c, or \... */
private int fetchEscapedValue() {
if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE);
@@ -164,20 +164,20 @@ class Lexer extends ScannerSupport {
fetchEscapedValueBackSlash();
}
break;
-
+
case 'c':
if (syntax.opEscCControl()) {
fetchEscapedValueControl();
}
/* fall through */
-
+
default:
fetchEscapedValueBackSlash();
} // switch
-
+
return c; // ???
}
-
+
private void fetchEscapedValueBackSlash() {
c = env.convertBackslashValue(c);
}
@@ -194,7 +194,7 @@ class Lexer extends ScannerSupport {
c &= 0x9f;
}
}
-
+
private int nameEndCodePoint(int start) {
switch(start) {
case '<':
@@ -212,16 +212,16 @@ class Lexer extends ScannerSupport {
\k<num+n>, \k<num-n>
\k<-num+n>, \k<-num-n>
*/
-
+
// value implicit (rnameEnd)
private boolean fetchNameWithLevel(int startCode, int[]rbackNum, int[]rlevel) {
int src = p;
boolean existLevel = false;
int isNum = 0;
int sign = 1;
-
+
int endCode = nameEndCodePoint(startCode);
- int pnumHead = p;
+ int pnumHead = p;
int nameEnd = stop;
String err = null;
@@ -232,15 +232,15 @@ class Lexer extends ScannerSupport {
if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME);
if (enc.isDigit(c)) {
isNum = 1;
- } else if (c == '-') {
+ } else if (c == '-') {
isNum = 2;
sign = -1;
pnumHead = p;
- } else if (!enc.isWord(c)) {
+ } else if (!enc.isWord(c)) {
err = ERR_INVALID_GROUP_NAME;
}
}
-
+
while (left()) {
nameEnd = p;
fetch();
@@ -248,7 +248,7 @@ class Lexer extends ScannerSupport {
if (isNum == 2) err = ERR_INVALID_GROUP_NAME;
break;
}
-
+
if (isNum != 0) {
if (enc.isDigit(c)) {
isNum = 1;
@@ -273,11 +273,11 @@ class Lexer extends ScannerSupport {
if (level < 0) newValueException(ERR_TOO_BIG_NUMBER);
rlevel[0] = level * flag;
existLevel = true;
-
+
fetch();
isEndCode = c == endCode;
}
-
+
if (!isEndCode) {
err = ERR_INVALID_GROUP_NAME;
nameEnd = stop;
@@ -295,7 +295,7 @@ class Lexer extends ScannerSupport {
} else if (backNum == 0) {
newValueException(ERR_INVALID_GROUP_NAME, src, stop);
}
- rbackNum[0] = backNum * sign;
+ rbackNum[0] = backNum * sign;
}
value = nameEnd;
return existLevel;
@@ -304,14 +304,14 @@ class Lexer extends ScannerSupport {
return false; // not reached
}
}
-
+
// USE_NAMED_GROUP
// ref: 0 -> define name (don't allow number name)
// 1 -> reference name (allow number name)
private int fetchNameForNamedGroup(int startCode, boolean ref) {
int src = p;
value = 0;
-
+
int isNum = 0;
int sign = 1;
@@ -332,7 +332,7 @@ class Lexer extends ScannerSupport {
err = ERR_INVALID_GROUP_NAME;
// isNum = 0;
}
- } else if (c == '-') {
+ } else if (c == '-') {
if (ref) {
isNum = 2;
sign = -1;
@@ -342,10 +342,10 @@ class Lexer extends ScannerSupport {
// isNum = 0;
}
} else if (!enc.isWord(c)) {
- err = ERR_INVALID_CHAR_IN_GROUP_NAME;
+ err = ERR_INVALID_CHAR_IN_GROUP_NAME;
}
}
-
+
if (err == null) {
while (left()) {
nameEnd = p;
@@ -354,7 +354,7 @@ class Lexer extends ScannerSupport {
if (isNum == 2) err = ERR_INVALID_GROUP_NAME;
break;
}
-
+
if (isNum != 0) {
if (enc.isDigit(c)) {
isNum = 1;
@@ -372,7 +372,7 @@ class Lexer extends ScannerSupport {
}
}
}
-
+
if (c != endCode) {
err = ERR_INVALID_GROUP_NAME;
nameEnd = stop;
@@ -410,12 +410,12 @@ class Lexer extends ScannerSupport {
private final int fetchNameForNoNamedGroup(int startCode, boolean ref) {
int src = p;
value = 0;
-
+
int isNum = 0;
int sign = 1;
-
+
int endCode = nameEndCodePoint(startCode);
- int pnumHead = p;
+ int pnumHead = p;
int nameEnd = stop;
String err = null;
@@ -424,7 +424,7 @@ class Lexer extends ScannerSupport {
} else {
fetch();
if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME);
-
+
if (enc.isDigit(c)) {
isNum = 1;
} else if (c == '-') {
@@ -438,17 +438,17 @@ class Lexer extends ScannerSupport {
while(left()) {
nameEnd = p;
-
+
fetch();
if (c == endCode || c == ')') break;
if (!enc.isDigit(c)) err = ERR_INVALID_CHAR_IN_GROUP_NAME;
}
-
- if (err == null && c != endCode) {
+
+ if (err == null && c != endCode) {
err = ERR_INVALID_GROUP_NAME;
nameEnd = stop;
}
-
+
if (err == null) {
mark();
p = pnumHead;
@@ -460,7 +460,7 @@ class Lexer extends ScannerSupport {
newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd);
}
backNum *= sign;
-
+
value = nameEnd;
return backNum;
} else {
@@ -468,7 +468,7 @@ class Lexer extends ScannerSupport {
return 0; // not reached
}
}
-
+
protected final int fetchName(int startCode, boolean ref) {
if (Config.USE_NAMED_GROUP) {
return fetchNameForNamedGroup(startCode, ref);
@@ -476,11 +476,11 @@ class Lexer extends ScannerSupport {
return fetchNameForNoNamedGroup(startCode, ref);
}
}
-
+
private boolean strExistCheckWithEsc(int[]s, int n, int bad) {
int p = this.p;
int to = this.stop;
-
+
boolean inEsc = false;
int i=0;
@@ -508,14 +508,14 @@ class Lexer extends ScannerSupport {
}
}
return false;
- }
-
- private static final int send[] = new int[]{':', ']'};
-
+ }
+
+ private static final int send[] = new int[]{':', ']'};
+
protected final TokenType fetchTokenInCC() {
int last;
int c2;
-
+
if (!left()) {
token.type = TokenType.EOT;
return token.type;
@@ -526,7 +526,7 @@ class Lexer extends ScannerSupport {
token.base = 0;
token.setC(c);
token.escaped = false;
-
+
if (c == ']') {
token.type = TokenType.CC_CLOSE;
} else if (c == '-') {
@@ -539,40 +539,40 @@ class Lexer extends ScannerSupport {
token.setC(c);
switch (c) {
-
+
case 'w':
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.WORD);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
token.setPropNot(false);
break;
-
+
case 'W':
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.WORD);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
token.setPropNot(true);
break;
-
+
case 'd':
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.DIGIT);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
token.setPropNot(false);
break;
case 'D':
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.DIGIT);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
token.setPropNot(true);
break;
case 's':
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.SPACE);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
token.setPropNot(false);
break;
-
+
case 'S':
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.SPACE);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
token.setPropNot(true);
break;
@@ -589,41 +589,41 @@ class Lexer extends ScannerSupport {
token.setPropCType(CharacterType.XDIGIT);
token.setPropNot(true);
break;
-
+
case 'p':
case 'P':
- c2 = peek(); // !!! migrate to peekIs
+ c2 = peek(); // !!! migrate to peekIs
if (c2 == '{' && syntax.op2EscPBraceCharProperty()) {
inc();
token.type = TokenType.CHAR_PROPERTY;
token.setPropNot(c == 'P');
-
+
if (syntax.op2EscPBraceCircumflexNot()) {
c2 = fetchTo();
if (c2 == '^') {
- token.setPropNot(!token.getPropNot());
+ token.setPropNot(!token.getPropNot());
} else {
unfetch();
}
}
}
break;
-
+
case 'x':
if (!left()) break;
last = p;
-
+
if (peekIs('{') && syntax.opEscXBraceHex8()) {
inc();
int num = scanUnsignedHexadecimalNumber(8);
if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE);
if (left()) {
c2 = peek();
- if (enc.isXDigit(c2)) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
+ if (enc.isXDigit(c2)) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
}
-
+
if (p > last + enc.length(bytes, last, stop) && left() && peekIs('}')) {
- inc();
+ inc();
token.type = TokenType.CODE_POINT;
token.base = 16;
token.setCode(num);
@@ -642,11 +642,11 @@ class Lexer extends ScannerSupport {
token.setC(num);
}
break;
-
+
case 'u':
if (!left()) break;
last = p;
-
+
if (syntax.op2EscUHex4()) {
int num = scanUnsignedHexadecimalNumber(4);
if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
@@ -658,7 +658,7 @@ class Lexer extends ScannerSupport {
token.setCode(num);
}
break;
-
+
case '0':
case '1':
case '2':
@@ -680,7 +680,7 @@ class Lexer extends ScannerSupport {
token.setC(num);
}
break;
-
+
default:
unfetch();
int num = fetchEscapedValue();
@@ -690,7 +690,7 @@ class Lexer extends ScannerSupport {
}
break;
} // switch
-
+
} else if (c == '[') {
if (syntax.opPosixBracket() && peekIs(':')) {
token.backP = p; /* point at '[' is readed */
@@ -721,24 +721,24 @@ class Lexer extends ScannerSupport {
}
return token.type;
}
-
+
protected final int backrefRelToAbs(int relNo) {
return env.numMem + 1 + relNo;
}
-
+
protected final TokenType fetchToken() {
int last;
-
+
// mark(); // out
-
+
start:
while(true) {
-
+
if (!left()) {
token.type = TokenType.EOT;
return token.type;
}
-
+
token.type = TokenType.STRING;
token.base = 0;
token.backP = p;
@@ -814,14 +814,14 @@ class Lexer extends ScannerSupport {
case 'w':
if (!syntax.opEscWWord()) break;
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.WORD);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
token.setPropNot(false);
break;
case 'W':
if (!syntax.opEscWWord()) break;
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.WORD);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
token.setPropNot(true);
break;
@@ -845,7 +845,7 @@ class Lexer extends ScannerSupport {
break;
} // USE_WORD_BEGIN_END
break; // ?
-
+
case '>':
if (Config.USE_WORD_BEGIN_END) {
if (!syntax.opEscLtGtWordBeginEnd()) break;
@@ -858,28 +858,28 @@ class Lexer extends ScannerSupport {
case 's':
if (!syntax.opEscSWhiteSpace()) break;
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.SPACE);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
token.setPropNot(false);
break;
case 'S':
if (!syntax.opEscSWhiteSpace()) break;
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.SPACE);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
token.setPropNot(true);
break;
-
+
case 'd':
if (!syntax.opEscDDigit()) break;
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.DIGIT);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
token.setPropNot(false);
break;
-
+
case 'D':
if (!syntax.opEscDDigit()) break;
token.type = TokenType.CHAR_TYPE;
- token.setPropCType(CharacterType.DIGIT);
+ token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
token.setPropNot(true);
break;
@@ -903,26 +903,26 @@ class Lexer extends ScannerSupport {
token.type = TokenType.ANCHOR;
token.setSubtype(AnchorType.BEGIN_BUF);
break;
-
+
case 'Z':
if (!syntax.opEscAZBufAnchor()) break;
token.type = TokenType.ANCHOR;
token.setSubtype(AnchorType.SEMI_END_BUF);
break;
-
+
case 'z':
if (!syntax.opEscAZBufAnchor()) break;
- // end_buf label
- token.type = TokenType.ANCHOR;
+ // end_buf label
+ token.type = TokenType.ANCHOR;
token.setSubtype(AnchorType.END_BUF);
break;
-
+
case 'G':
if (!syntax.opEscCapitalGBeginAnchor()) break;
token.type = TokenType.ANCHOR;
token.setSubtype(AnchorType.BEGIN_POSITION);
break;
-
+
case '`':
if (!syntax.op2EscGnuBufAnchor()) break;
// goto begin_buf
@@ -932,8 +932,8 @@ class Lexer extends ScannerSupport {
case '\'':
if (!syntax.op2EscGnuBufAnchor()) break;
- // goto end_buf
- token.type = TokenType.ANCHOR;
+ // goto end_buf
+ token.type = TokenType.ANCHOR;
token.setSubtype(AnchorType.END_BUF);
break;
@@ -945,9 +945,9 @@ class Lexer extends ScannerSupport {
int num = scanUnsignedHexadecimalNumber(8);
if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE);
if (left()) {
- if (enc.isXDigit(peek())) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
+ if (enc.isXDigit(peek())) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
}
-
+
if (p > last + enc.length(bytes, last, stop) && left() && peekIs('}')) {
inc();
token.type = TokenType.CODE_POINT;
@@ -967,11 +967,11 @@ class Lexer extends ScannerSupport {
token.setC(num);
}
break;
-
+
case 'u': // extract to helper
if (!left()) break;
last = p;
-
+
if (syntax.op2EscUHex4()) {
int num = scanUnsignedHexadecimalNumber(4);
if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
@@ -983,7 +983,7 @@ class Lexer extends ScannerSupport {
token.setCode(num);
}
break;
-
+
case '1':
case '2':
case '3':
@@ -992,11 +992,11 @@ class Lexer extends ScannerSupport {
case '6':
case '7':
case '8':
- case '9':
+ case '9':
unfetch();
last = p;
int num = scanUnsignedNumber();
- if (num < 0 || num > Config.MAX_BACKREF_NUM) {
+ if (num < 0 || num > Config.MAX_BACKREF_NUM) {
// goto skip_backref
} else if (syntax.opDecimalBackref() && (num <= env.numMem || num <= 9)) { /* This spec. from GNU regex */
if (syntax.strictCheckBackref()) {
@@ -1018,7 +1018,7 @@ class Lexer extends ScannerSupport {
}
p = last;
/* fall through */
-
+
case '0':
if (syntax.opEscOctal3()) {
last = p;
@@ -1034,7 +1034,7 @@ class Lexer extends ScannerSupport {
inc();
}
break;
-
+
case 'k':
if (Config.USE_NAMED_GROUP) {
if (syntax.op2EscKNamedBackref()) {
@@ -1052,13 +1052,13 @@ class Lexer extends ScannerSupport {
backNum = fetchName(c, true);
} // USE_BACKREF_AT_LEVEL
int nameEnd = value; // set by fetchNameWithLevel/fetchName
-
+
if (backNum != 0) {
if (backNum < 0) {
backNum = backrefRelToAbs(backNum);
if (backNum <= 0) newValueException(ERR_INVALID_BACKREF);
}
-
+
if (syntax.strictCheckBackref() && (backNum > env.numMem || env.memNodes == null)) {
newValueException(ERR_INVALID_BACKREF);
}
@@ -1099,11 +1099,11 @@ class Lexer extends ScannerSupport {
unfetch();
}
}
-
+
break;
} // USE_NAMED_GROUP
break;
-
+
case 'g':
if (Config.USE_SUBEXP_CALL) {
if (syntax.op2EscGSubexpCall()) {
@@ -1120,34 +1120,34 @@ class Lexer extends ScannerSupport {
unfetch();
}
}
- break;
+ break;
} // USE_SUBEXP_CALL
break;
-
+
case 'Q':
if (syntax.op2EscCapitalQQuote()) {
token.type = TokenType.QUOTE_OPEN;
}
break;
-
+
case 'p':
case 'P':
if (peekIs('{') && syntax.op2EscPBraceCharProperty()) {
inc();
token.type = TokenType.CHAR_PROPERTY;
token.setPropNot(c == 'P');
-
+
if (syntax.op2EscPBraceCircumflexNot()) {
fetch();
if (c == '^') {
- token.setPropNot(!token.getPropNot());
+ token.setPropNot(!token.getPropNot());
} else {
unfetch();
}
}
}
break;
-
+
default:
unfetch();
num = fetchEscapedValue();
@@ -1160,13 +1160,13 @@ class Lexer extends ScannerSupport {
p = token.backP + enc.length(bytes, token.backP, stop);
}
break;
-
+
} // switch (c)
-
+
} else {
token.setC(c);
token.escaped = false;
-
+
// remove code duplication
if (Config.USE_VARIABLE_META_CHARS) {
if (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters()) {
@@ -1198,16 +1198,16 @@ class Lexer extends ScannerSupport {
}
}
} // USE_VARIABLE_META_CHARS
-
- {
+
+ {
switch(c) {
-
+
case '.':
if (!syntax.opDotAnyChar()) break;
// any_char:
token.type = TokenType.ANYCHAR;
break;
-
+
case '*':
if (!syntax.opAsteriskZeroInf()) break;
// anytime:
@@ -1225,8 +1225,8 @@ class Lexer extends ScannerSupport {
token.setRepeatUpper(QuantifierNode.REPEAT_INFINITE);
greedyCheck();
break;
-
- case '?':
+
+ case '?':
if (!syntax.opQMarkZeroOne()) break;
// zero_or_one_time:
token.type = TokenType.OP_REPEAT;
@@ -1234,7 +1234,7 @@ class Lexer extends ScannerSupport {
token.setRepeatUpper(1);
greedyCheck();
break;
-
+
case '{':
if (!syntax.opBraceInterval()) break;
switch(fetchRangeQuantifier()) {
@@ -1251,12 +1251,12 @@ class Lexer extends ScannerSupport {
default: /* 1 : normal char */
} // inner switch
break;
-
+
case '|':
if (!syntax.opVBarAlt()) break;
token.type = TokenType.ALT;
break;
-
+
case '(':
if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
inc();
@@ -1275,49 +1275,49 @@ class Lexer extends ScannerSupport {
}
unfetch();
}
-
+
if (!syntax.opLParenSubexp()) break;
token.type = TokenType.SUBEXP_OPEN;
break;
-
+
case ')':
if (!syntax.opLParenSubexp()) break;
- token.type = TokenType.SUBEXP_CLOSE;
+ token.type = TokenType.SUBEXP_CLOSE;
break;
-
+
case '^':
if (!syntax.opLineAnchor()) break;
token.type = TokenType.ANCHOR;
token.setSubtype(isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE);
break;
-
+
case '$':
if (!syntax.opLineAnchor()) break;
token.type = TokenType.ANCHOR;
token.setSubtype(isSingleline(env.option) ? AnchorType.SEMI_END_BUF : AnchorType.END_LINE);
break;
-
+
case '[':
if (!syntax.opBracketCC()) break;
token.type = TokenType.CC_CC_OPEN;
break;
-
+
case ']':
//if (*src > env->pattern) /* /].../ is allowed. */
//CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
break;
-
+
case '#':
if (Option.isExtend(env.option)) {
while (left()) {
fetch();
if (enc.isNewLine(c)) break;
}
- continue start; // goto start
-
+ continue start; // goto start
+
}
break;
-
+
case ' ':
case '\t':
case '\n':
@@ -1327,22 +1327,22 @@ class Lexer extends ScannerSupport {
continue start; // goto start
}
break;
-
+
default: // string
break;
-
+
} // switch
}
}
-
+
break;
} // while
- return token.type;
+ return token.type;
}
-
+
private void greedyCheck() {
if (left() && peekIs('?') && syntax.opQMarkNonGreedy()) {
-
+
fetch();
token.setRepeatGreedy(false);
@@ -1351,14 +1351,14 @@ class Lexer extends ScannerSupport {
possessiveCheck();
}
}
-
+
private void possessiveCheck() {
- if (left() && peekIs('+') &&
+ if (left() && peekIs('+') &&
(syntax.op2PlusPossessiveRepeat() && token.type != TokenType.INTERVAL ||
syntax.op2PlusPossessiveInterval() && token.type == TokenType.INTERVAL)) {
-
+
fetch();
-
+
token.setRepeatGreedy(true);
token.setRepeatPossessive(true);
} else {
diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java
index a787d16..71d29fd 100644
--- a/src/org/joni/Parser.java
+++ b/src/org/joni/Parser.java
@@ -1,20 +1,20 @@
/*
- * Permission is hereby granted, free of charge, to any person obtaining a copy of
- * this software and associated documentation files (the "Software"), to deal in
- * the Software without restriction, including without limitation the rights to
- * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
- *
+ *
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.joni;
@@ -49,28 +49,28 @@ class Parser extends Lexer {
protected final Regex regex;
protected Node root;
-
+
protected int returnCode; // return code used by parser methods (they itself return parsed nodes)
- // this approach will not affect recursive calls
-
+ // this approach will not affect recursive calls
+
protected Parser(ScanEnvironment env, byte[]bytes, int p, int end) {
super(env, bytes, p, end);
regex = env.reg;
}
-
+
// onig_parse_make_tree
protected final Node parse() {
root = parseRegexp();
regex.numMem = env.numMem;
return root;
}
-
+
private static final int POSIX_BRACKET_NAME_MIN_LEN = 4;
private static final int POSIX_BRACKET_CHECK_LIMIT_LENGTH = 20;
private static final byte BRACKET_END[] = ":]".getBytes();
private boolean parsePosixBracket(CClassNode cc) {
mark();
-
+
boolean not;
if (peekIs('^')) {
inc();
@@ -94,7 +94,7 @@ class Parser extends Lexer {
return false;
}
}
-
+
}
// not_posix_bracket:
@@ -104,7 +104,7 @@ class Parser extends Lexer {
inc();
if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
}
-
+
if (c == ':' && left()) {
inc();
if (left()) {
@@ -115,7 +115,7 @@ class Parser extends Lexer {
restore();
return true; /* 1: is not POSIX bracket, but no error. */
}
-
+
private CClassNode parseCharProperty() {
int ctype = fetchCharPropertyToCType();
CClassNode n = new CClassNode();
@@ -123,28 +123,28 @@ class Parser extends Lexer {
if (token.getPropNot()) n.setNot();
return n;
}
-
+
private boolean codeExistCheck(int code, boolean ignoreEscaped) {
mark();
-
+
boolean inEsc = false;
while(left()) {
- if (ignoreEscaped && inEsc) {
+ if (ignoreEscaped && inEsc) {
inEsc = false;
} else {
fetch();
if (c == code) {
restore();
- return true;
+ return true;
}
if (c == syntax.metaCharTable.esc) inEsc = true;
}
}
-
+
restore();
return false;
}
-
+
private CClassNode parseCharClass() {
fetchTokenInCC();
@@ -155,35 +155,45 @@ class Parser extends Lexer {
} else {
neg = false;
}
-
+
if (token.type == TokenType.CC_CLOSE) {
if (!codeExistCheck(']', true)) newSyntaxException(ERR_EMPTY_CHAR_CLASS);
env.ccEscWarn("]");
token.type = TokenType.CHAR; /* allow []...] */
}
-
+
CClassNode cc = new CClassNode();
CClassNode prevCC = null;
CClassNode workCC = null;
CCStateArg arg = new CCStateArg();
-
+
boolean andStart = false;
arg.state = CCSTATE.START;
while(token.type != TokenType.CC_CLOSE) {
boolean fetched = false;
-
+
switch (token.type) {
-
+
case CHAR:
- int len = enc.codeToMbcLength(token.getC());
- if (len > 1) {
- arg.inType = CCVALTYPE.CODE_POINT;
- } else {
- // !sb_char:!
- arg.inType = CCVALTYPE.SB;
- }
+ int len;
+// if (Config.VANILLA) {
+ len = enc.codeToMbcLength(token.getC());
+ if (len > 1) {
+ arg.inType = CCVALTYPE.CODE_POINT;
+ } else {
+ // !sb_char:!
+ arg.inType = CCVALTYPE.SB;
+ }
+// } else {
+// if (token.getCode() >= BitSet.SINGLE_BYTE_SIZE || (len = enc.codeToMbcLength(token.getC())) > 1) {
+// arg.inType = CCVALTYPE.CODE_POINT;
+// } else {
+// // !sb_char:!
+// arg.inType = CCVALTYPE.SB;
+// }
+// }
arg.v = token.getC();
arg.vIsRaw = false;
// !goto val_entry2;!
@@ -207,9 +217,9 @@ class Parser extends Lexer {
buf[i] = (byte)token.getC();
}
if (i < enc.minLength()) newValueException(ERR_TOO_SHORT_MULTI_BYTE_STRING);
-
+
len = enc.length(buf, 0, i);
- if (i < len) {
+ if (i < len) {
newValueException(ERR_TOO_SHORT_MULTI_BYTE_STRING);
} else if (i > len) { /* fetch back */
p = psave;
@@ -233,7 +243,7 @@ class Parser extends Lexer {
// !goto val_entry2;!
valEntry2(cc, arg);
break;
-
+
case CODE_POINT:
arg.v = token.getCode();
arg.vIsRaw = true;
@@ -241,7 +251,7 @@ class Parser extends Lexer {
// !val_entry2:!
valEntry(cc, arg);
break;
-
+
case POSIX_BRACKET_OPEN:
if (parsePosixBracket(cc)) { /* true: is not POSIX bracket */
env.ccEscWarn("[");
@@ -255,20 +265,20 @@ class Parser extends Lexer {
// !goto next_class;!
cc.nextStateClass(arg, env);
break;
-
+
case CHAR_TYPE:
cc.addCType(token.getPropCType(), token.getPropNot(), env, this);
// !next_class:!
cc.nextStateClass(arg, env);
break;
-
+
case CHAR_PROPERTY:
int ctype = fetchCharPropertyToCType();
cc.addCType(ctype, token.getPropNot(), env, this);
// !goto next_class;!
cc.nextStateClass(arg, env);
break;
-
+
case CC_RANGE:
if (arg.state == CCSTATE.VALUE) {
fetchTokenInCC();
@@ -315,7 +325,7 @@ class Parser extends Lexer {
rangeEndVal(cc, arg);
break;
}
-
+
if (syntax.allowDoubleRangeOpInCC()) {
env.ccEscWarn("-");
/* [0-9-a] is allowed as [0-9\-a] */
@@ -326,12 +336,12 @@ class Parser extends Lexer {
newSyntaxException(ERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS);
}
break;
-
+
case CC_CC_OPEN: /* [ */
CClassNode acc = parseCharClass();
cc.or(acc, enc);
break;
-
+
case CC_AND: /* && */
if (arg.state == CCSTATE.VALUE) {
arg.v = 0; // ??? safe v ?
@@ -349,36 +359,36 @@ class Parser extends Lexer {
cc = workCC;
}
// initialize_cclass(cc); // clear it ??
- break;
-
+ break;
+
case EOT:
newSyntaxException(ERR_PREMATURE_END_OF_CHAR_CLASS);
-
- default:
- newInternalException(ERR_PARSER_BUG);
+
+ default:
+ newInternalException(ERR_PARSER_BUG);
} // switch
-
+
if (!fetched) fetchTokenInCC();
-
+
} // while
-
+
if (arg.state == CCSTATE.VALUE) {
arg.v = 0; // ??? safe v ?
arg.vIsRaw = false;
cc.nextStateValue(arg, env);
}
-
+
if (prevCC != null) {
prevCC.and(cc, enc);
cc = prevCC;
}
-
+
if (neg) {
cc.setNot();
} else {
cc.clearNot();
}
-
+
if (cc.isNot() && syntax.notNewlineInNegativeCC()) {
if (!cc.isEmpty()) {
final int NEW_LINE = 0x0a;
@@ -391,21 +401,21 @@ class Parser extends Lexer {
}
}
}
-
+
return cc;
}
-
+
private void valEntry2(CClassNode cc, CCStateArg arg) {
cc.nextStateValue(arg, env);
}
-
+
private void valEntry(CClassNode cc, CCStateArg arg) {
int len = enc.codeToMbcLength(arg.v);
arg.inType = len == 1 ? CCVALTYPE.SB : CCVALTYPE.CODE_POINT;
// !val_entry2:!
valEntry2(cc, arg);
}
-
+
private void sbChar(CClassNode cc, CCStateArg arg) {
arg.inType = CCVALTYPE.SB;
arg.v = token.getC();
@@ -420,20 +430,20 @@ class Parser extends Lexer {
// !goto val_entry;!
valEntry(cc, arg);
}
-
+
private Node parseEnclose(TokenType term) {
Node node = null;
-
+
if (!left()) newSyntaxException(ERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS);
-
+
int option = env.option;
-
- if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
+
+ if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
inc();
if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP);
-
+
boolean listCapture = false;
-
+
fetch();
switch(c) {
case ':': /* (?:...) grouping only */
@@ -442,19 +452,19 @@ class Parser extends Lexer {
node = parseSubExp(term);
returnCode = 1; /* group */
return node;
-
+
case '=':
node = new AnchorNode(AnchorType.PREC_READ);
break;
-
+
case '!': /* preceding read */
node = new AnchorNode(AnchorType.PREC_READ_NOT);
break;
-
+
case '>': /* (?>...) stop backtrack */
node = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose
break;
-
+
case '\'':
if (Config.USE_NAMED_GROUP) {
if (syntax.op2QMarkLtNamedGroup()) {
@@ -479,7 +489,7 @@ class Parser extends Lexer {
if (syntax.op2QMarkLtNamedGroup()) {
unfetch();
c = '<';
-
+
// !named_group1:!
listCapture = false;
// !named_group2:!
@@ -488,17 +498,17 @@ class Parser extends Lexer {
} else {
newSyntaxException(ERR_UNDEFINED_GROUP_OPTION);
}
-
+
} else { // USE_NAMED_GROUP
newSyntaxException(ERR_UNDEFINED_GROUP_OPTION);
} // USE_NAMED_GROUP
}
break;
-
+
case '@':
- if (syntax.op2AtMarkCaptureHistory()) {
+ if (syntax.op2AtMarkCaptureHistory()) {
if (Config.USE_NAMED_GROUP) {
- if (syntax.op2QMarkLtNamedGroup()) {
+ if (syntax.op2QMarkLtNamedGroup()) {
fetch();
if (c == '<' || c == '\'') {
listCapture = true;
@@ -518,7 +528,7 @@ class Parser extends Lexer {
newSyntaxException(ERR_UNDEFINED_GROUP_OPTION);
}
break;
-
+
// case 'p': #ifdef USE_POSIXLINE_OPTION
case '-':
case 'i':
@@ -531,19 +541,19 @@ class Parser extends Lexer {
case ':':
case ')':
break;
-
+
case '-':
neg = true;
break;
-
+
case 'x':
option = bsOnOff(option, Option.EXTEND, neg);
break;
-
+
case 'i':
option = bsOnOff(option, Option.IGNORECASE, neg);
break;
-
+
case 's':
if (syntax.op2OptionPerl()) {
option = bsOnOff(option, Option.MULTILINE, neg);
@@ -551,7 +561,7 @@ class Parser extends Lexer {
newSyntaxException(ERR_UNDEFINED_GROUP_OPTION);
}
break;
-
+
case 'm':
if (syntax.op2OptionPerl()) {
option = bsOnOff(option, Option.SINGLELINE, !neg);
@@ -561,15 +571,15 @@ class Parser extends Lexer {
newSyntaxException(ERR_UNDEFINED_GROUP_OPTION);
}
break;
-
+
// case 'p': #ifdef USE_POSIXLINE_OPTION // not defined
// option = bsOnOff(option, Option.MULTILINE|Option.SINGLELINE, neg);
// break;
-
+
default:
newSyntaxException(ERR_UNDEFINED_GROUP_OPTION);
} // switch
-
+
if (c == ')') {
EncloseNode en = new EncloseNode(option, 0); // node_new_option
node = en;
@@ -590,11 +600,11 @@ class Parser extends Lexer {
if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP);
fetch();
} // while
-
+
default:
newSyntaxException(ERR_UNDEFINED_GROUP_OPTION);
} // switch
-
+
} else {
if (isDontCaptureGroup(env.option)) {
// !goto group;!
@@ -608,7 +618,7 @@ class Parser extends Lexer {
en.regNum = num;
node = en;
}
-
+
fetchToken();
Node target = parseSubExp(term);
@@ -626,25 +636,25 @@ class Parser extends Lexer {
returnCode = 0;
return node; // ??
}
-
+
private Node namedGroup2(boolean listCapture) {
int nm = p;
int num = fetchName(c, false);
int nameEnd = value;
num = env.addMemEntry();
if (listCapture && num >= BitStatus.BIT_STATUS_BITS_NUM) newValueException(ERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY);
-
+
regex.nameAdd(bytes, nm, nameEnd, num, syntax);
EncloseNode en = new EncloseNode(env.option, true); // node_new_enclose_memory
en.regNum = num;
Node node = en;
-
+
if (listCapture) env.captureHistory = bsOnAtSimple(env.captureHistory, num);
env.numNamed++;
return node;
}
-
+
private int nextChar; // hidden var
private int findStrPosition(int[]s, int n, int from, int to) {
int x;
@@ -661,7 +671,7 @@ class Parser extends Lexer {
q += enc.length(bytes, q, to);
}
if (i >= n) {
- if (bytes[nextChar] != 0) nextChar = q; // we may need zero term semantics...
+ if (bytes[nextChar] != 0) nextChar = q; // we may need zero term semantics...
return p;
}
}
@@ -669,13 +679,13 @@ class Parser extends Lexer {
}
return -1;
}
-
+
private Node parseExp(TokenType term) {
if (token.type == term) {
//!goto end_of_token;!
return new StringNode();
}
-
+
Node node = null;
boolean group = false;
@@ -684,7 +694,7 @@ class Parser extends Lexer {
case EOT:
// !end_of_token:!
return new StringNode(); // node_new_empty
-
+
case SUBEXP_OPEN:
node = parseEnclose(TokenType.SUBEXP_CLOSE);
if (returnCode == 1) {
@@ -697,13 +707,13 @@ class Parser extends Lexer {
Node target = parseSubExp(term);
env.option = prev;
en.setTarget(target);
- return node;
+ return node;
}
break;
-
+
case SUBEXP_CLOSE:
if (!syntax.allowUnmatchedCloseSubexp()) newSyntaxException(ERR_UNMATCHED_CLOSE_PARENTHESIS);
-
+
if (token.escaped) {
// !goto tk_raw_byte;!
return parseExpTkRawByte(group);
@@ -711,22 +721,22 @@ class Parser extends Lexer {
// !goto tk_byte;!
return parseExpTkByte(group);
}
-
+
case STRING:
// !tk_byte:!
return parseExpTkByte(group);
-
+
case RAW_BYTE:
// !tk_raw_byte:!
return parseExpTkRawByte(group);
-
+
case CODE_POINT:
byte[]buf = new byte[Config.ENC_CODE_TO_MBC_MAXLEN];
int num = enc.codeToMbc(token.getCode(), buf, 0);
- // #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG ... // setRaw() #else
+ // #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG ... // setRaw() #else
node = new StringNode(buf, 0, num);
break;
-
+
case QUOTE_OPEN:
int[]endOp = new int[]{syntax.metaCharTable.esc, 'E'};
int qstart = p;
@@ -740,10 +750,21 @@ class Parser extends Lexer {
case CHAR_TYPE:
switch(token.getPropCType()) {
+ case CharacterType.D:
+ case CharacterType.S:
+ case CharacterType.W:
+ if (Config.NON_UNICODE_SDW) {
+ CClassNode cc = new CClassNode();
+ cc.addCType(token.getPropCType(), false, env, this);
+ if (token.getPropNot()) cc.setNot();
+ node = cc;
+ }
+ break;
+
case CharacterType.WORD:
node = new CTypeNode(token.getPropCType(), token.getPropNot());
break;
-
+
case CharacterType.SPACE:
case CharacterType.DIGIT:
case CharacterType.XDIGIT:
@@ -753,41 +774,41 @@ class Parser extends Lexer {
if (token.getPropNot()) ccn.setNot();
node = ccn;
break;
-
+
default:
newInternalException(ERR_PARSER_BUG);
-
+
} // inner switch
break;
-
+
case CHAR_PROPERTY:
node = parseCharProperty();
break;
-
+
case CC_CC_OPEN:
CClassNode cc = parseCharClass();
node = cc;
if (isIgnoreCase(env.option)) {
ApplyCaseFoldArg arg = new ApplyCaseFoldArg(env, cc);
enc.applyAllCaseFold(env.caseFoldFlag, ApplyCaseFold.INSTANCE, arg);
-
+
if (arg.altRoot != null) {
node = ConsAltNode.newAltNode(node, arg.altRoot);
}
}
break;
-
+
case ANYCHAR:
node = new AnyCharNode();
break;
-
+
case ANYCHAR_ANYTIME:
node = new AnyCharNode();
QuantifierNode qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
qn.setTarget(node);
node = qn;
break;
-
+
case BACKREF:
int[]backRefs = token.getBackrefNum() > 1 ? token.getBackrefRefs() : new int[]{token.getBackrefRef1()};
node = new BackRefNode(token.getBackrefNum(),
@@ -796,9 +817,9 @@ class Parser extends Lexer {
token.getBackrefExistLevel(), // #ifdef USE_BACKREF_AT_LEVEL
token.getBackrefLevel(), // ...
env);
-
+
break;
-
+
case CALL:
if (Config.USE_SUBEXP_CALL) {
int gNum = token.getCallGNum();
@@ -816,7 +837,7 @@ class Parser extends Lexer {
case ANCHOR:
node = new AnchorNode(token.getAnchor()); // possible bug in oniguruma
break;
-
+
case OP_REPEAT:
case INTERVAL:
if (syntax.contextIndepRepeatOps()) {
@@ -830,75 +851,75 @@ class Parser extends Lexer {
return parseExpTkByte(group);
}
break;
-
+
default:
newInternalException(ERR_PARSER_BUG);
} //switch
-
+
//targetp = node;
-
+
// !re_entry:!
fetchToken();
-
+
// !repeat:!
return parseExpRepeat(node, group);
}
-
+
private Node parseExpTkByte(boolean group) {
// !tk_byte:!
StringNode node = new StringNode(bytes, token.backP, p);
while (true) {
fetchToken();
if (token.type != TokenType.STRING) break;
-
+
if (token.backP == node.end) {
node.end = p; // non escaped character, remain shared, just increase shared range
} else {
- node.cat(bytes, token.backP, p); // non continuous string stream, need to COW
+ node.cat(bytes, token.backP, p); // non continuous string stream, need to COW
}
- }
+ }
// !string_end:!
// targetp = node;
// !goto repeat;!
return parseExpRepeat(node, group);
}
-
+
private Node parseExpTkRawByte(boolean group) {
// !tk_raw_byte:!
// important: we don't use 0xff mask here neither in the compiler
// (in the template string) so we won't have to mask target
- // strings when comparing against them in the matcher
+ // strings when comparing against them in the matcher
StringNode node = new StringNode((byte)token.getC());
node.setRaw();
- int len = 1;
+ int len = 1;
while (true) {
- if (len >= enc.minLength()) {
- if (len == enc.length(node.bytes, node.p, node.end)) {
+ if (len >= enc.minLength()) {
+ if (len == enc.length(node.bytes, node.p, node.end)) {
fetchToken();
node.clearRaw();
// !goto string_end;!
return parseExpRepeat(node, group);
}
}
-
+
fetchToken();
if (token.type != TokenType.RAW_BYTE) {
/* Don't use this, it is wrong for little endian encodings. */
// USE_PAD_TO_SHORT_BYTE_CHAR ...
-
+
newValueException(ERR_TOO_SHORT_MULTI_BYTE_STRING);
}
// important: we don't use 0xff mask here neither in the compiler
// (in the template string) so we won't have to mask target
- // strings when comparing against them in the matcher
+ // strings when comparing against them in the matcher
node.cat((byte)token.getC());
len++;
} // while
}
-
+
private Node parseExpRepeat(Node target, boolean group) {
// !repeat:!
while (token.type == TokenType.OP_REPEAT || token.type == TokenType.INTERVAL) {
@@ -907,11 +928,11 @@ class Parser extends Lexer {
QuantifierNode qtfr = new QuantifierNode(token.getRepeatLower(),
token.getRepeatUpper(),
token.type == TokenType.INTERVAL);
-
+
qtfr.greedy = token.getRepeatGreedy();
int ret = qtfr.setQuantifier(target, group, env, bytes, getBegin(), getEnd());
Node qn = qtfr;
-
+
if (token.getRepeatPossessive()) {
EncloseNode en = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose
en.setTarget(qn);
@@ -923,7 +944,7 @@ class Parser extends Lexer {
} else if (ret == 2) { /* split case: /abc+/ */
target = ConsAltNode.newListNode(target, null);
ConsAltNode tmp = ((ConsAltNode)target).setCdr(ConsAltNode.newListNode(qn, null));
-
+
fetchToken();
return parseExpRepeatForCar(target, tmp, group);
}
@@ -941,11 +962,11 @@ class Parser extends Lexer {
QuantifierNode qtfr = new QuantifierNode(token.getRepeatLower(),
token.getRepeatUpper(),
token.type == TokenType.INTERVAL);
-
+
qtfr.greedy = token.getRepeatGreedy();
int ret = qtfr.setQuantifier(target.car, group, env, bytes, getBegin(), getEnd());
Node qn = qtfr;
-
+
if (token.getRepeatPossessive()) {
EncloseNode en = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose
en.setTarget(qn);
@@ -961,7 +982,7 @@ class Parser extends Lexer {
fetchToken();
}
return top;
- }
+ }
private Node parseBranch(TokenType term) {
Node node = parseExp(term);
@@ -971,13 +992,13 @@ class Parser extends Lexer {
} else {
ConsAltNode top = ConsAltNode.newListNode(node, null);
ConsAltNode t = top;
-
+
while (token.type != TokenType.EOT && token.type != term && token.type != TokenType.ALT) {
node = parseExp(term);
if (node.getType() == NodeType.LIST) {
t.setCdr((ConsAltNode)node);
while (((ConsAltNode)node).cdr != null ) node = ((ConsAltNode)node).cdr;
-
+
t = ((ConsAltNode)node);
} else {
t.setCdr(ConsAltNode.newListNode(node, null));
@@ -987,7 +1008,7 @@ class Parser extends Lexer {
return top;
}
}
-
+
/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
private Node parseSubExp(TokenType term) {
Node node = parseBranch(term);
@@ -1000,11 +1021,11 @@ class Parser extends Lexer {
while (token.type == TokenType.ALT) {
fetchToken();
node = parseBranch(term);
-
+
t.setCdr(ConsAltNode.newAltNode(node, null));
t = t.cdr;
}
-
+
if (token.type != term) parseSubExpError(term);
return top;
} else {
@@ -1012,7 +1033,7 @@ class Parser extends Lexer {
return null; //not reached
}
}
-
+
private void parseSubExpError(TokenType term) {
if (term == TokenType.SUBEXP_CLOSE) {
newSyntaxException(ERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS);
@@ -1020,7 +1041,7 @@ class Parser extends Lexer {
newInternalException(ERR_PARSER_BUG);
}
}
-
+
private Node parseRegexp() {
fetchToken();
return parseSubExp(TokenType.EOT);
diff --git a/src/org/joni/ast/CClassNode.java b/src/org/joni/ast/CClassNode.java
index c05c9f3..86c82fb 100644
--- a/src/org/joni/ast/CClassNode.java
+++ b/src/org/joni/ast/CClassNode.java
@@ -22,8 +22,10 @@ package org.joni.ast;
import org.jcodings.CodeRange;
import org.jcodings.Encoding;
import org.jcodings.IntHolder;
+import org.jcodings.ascii.AsciiTables;
import org.jcodings.constants.CharacterType;
import org.jcodings.exception.EncodingException;
+import org.jcodings.specific.ASCIIEncoding;
import org.joni.BitSet;
import org.joni.CodeRangeBuffer;
import org.joni.Config;
@@ -326,8 +328,29 @@ public final class CClassNode extends Node {
public void addCType(int ctype, boolean not, ScanEnvironment env, IntHolder sbOut) {
Encoding enc = env.enc;
- int[]ranges = enc.ctypeCodeRange(ctype, sbOut);
+ if (Config.NON_UNICODE_SDW) {
+ switch(ctype) {
+ case CharacterType.D:
+ case CharacterType.S:
+ case CharacterType.W:
+ ctype ^= CharacterType.SPECIAL_MASK;
+ if (not) {
+ for (int c = 0; c < BitSet.SINGLE_BYTE_SIZE; c++) {
+ if (!ASCIIEncoding.INSTANCE.isCodeCType(c, ctype)) bs.set(c);
+ //if ((AsciiTables.AsciiCtypeTable[c] & (1 << ctype)) == 0) bs.set(c);
+ }
+ addAllMultiByteRange(enc);
+ } else {
+ for (int c = 0; c < BitSet.SINGLE_BYTE_SIZE; c++) {
+ if (ASCIIEncoding.INSTANCE.isCodeCType(c, ctype)) bs.set(c);
+ //if ((AsciiTables.AsciiCtypeTable[c] & (1 << ctype)) != 0) bs.set(c);
+ }
+ }
+ return;
+ }
+ }
+ int[]ranges = enc.ctypeCodeRange(ctype, sbOut);
if (ranges != null) {
addCTypeByRange(ctype, not, enc, sbOut.value, ranges);
return;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git
More information about the pkg-java-commits mailing list