[jruby-joni] 87/223: More lexer refactoring.

Hideki Yamane henrich at moszumanska.debian.org
Mon Nov 16 11:21:53 UTC 2015


This is an automated email from the git hooks/post-receive script.

henrich pushed a commit to branch debian/sid
in repository jruby-joni.

commit 02b6ca0b9b601d2da4976cae5d38b9518c3ba47f
Author: Marcin Mielzynski <lopx at gazeta.pl>
Date:   Fri Feb 17 21:20:51 2012 +0100

    More lexer refactoring.
---
 src/org/joni/Lexer.java  | 493 ++++++++++++++++++++++++-----------------------
 src/org/joni/Parser.java |   2 +-
 2 files changed, 250 insertions(+), 245 deletions(-)

diff --git a/src/org/joni/Lexer.java b/src/org/joni/Lexer.java
index 5af6b6a..9ba85b7 100644
--- a/src/org/joni/Lexer.java
+++ b/src/org/joni/Lexer.java
@@ -801,6 +801,35 @@ class Lexer extends ScannerSupport {
         }
     }
 
+    private void fetchTokenFor_digit() {
+        unfetch();
+        int last = p;
+        int num = scanUnsignedNumber();
+        if (num < 0 || num > Config.MAX_BACKREF_NUM) {
+            // goto skip_backref
+        } else if (syntax.opDecimalBackref() && (num <= env.numMem || num <= 9)) { /* This spec. from GNU regex */
+            if (syntax.strictCheckBackref()) {
+                if (num > env.numMem || env.memNodes == null || env.memNodes[num] == null) newValueException(ERR_INVALID_BACKREF);
+            }
+            token.type = TokenType.BACKREF;
+            token.setBackrefNum(1);
+            token.setBackrefRef1(num);
+            token.setBackrefByName(false);
+            if (Config.USE_BACKREF_WITH_LEVEL) token.setBackrefExistLevel(false);
+            return;
+        }
+        // skip_backref:
+        if (c == '8' || c == '9') {
+            /* normal char */
+            p = last;
+            inc();
+            return;
+        }
+        p = last;
+        /* fall through */
+        fetchTokenFor_zero();
+    }
+
     private void fetchTokenFor_zero() {
         if (syntax.opEscOctal3()) {
             int last = p;
@@ -928,287 +957,263 @@ class Lexer extends ScannerSupport {
         }
     }
 
+    private void fetchTokenFor_metaChars() {
+        if (c == syntax.metaCharTable.anyChar) {
+            token.type = TokenType.ANYCHAR;
+        } else if (c == syntax.metaCharTable.anyTime) {
+            fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
+        }  else if (c == syntax.metaCharTable.zeroOrOneTime) {
+            fetchTokenFor_repeat(0, 1);
+        } else if (c == syntax.metaCharTable.oneOrMoreTime) {
+            fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
+        } else if (c == syntax.metaCharTable.anyCharAnyTime) {
+            token.type = TokenType.ANYCHAR_ANYTIME;
+            // goto out
+        }
+    }
+
     protected final TokenType fetchToken() {
-        int last;
         // mark(); // out
         start:
         while(true) {
+            if (!left()) {
+                token.type = TokenType.EOT;
+                return token.type;
+            }
 
-        if (!left()) {
-            token.type = TokenType.EOT;
-            return token.type;
-        }
-
-        token.type = TokenType.STRING;
-        token.base = 0;
-        token.backP = p;
-
-        fetch();
-
-        if (c == syntax.metaCharTable.esc && !syntax.op2IneffectiveEscape()) { // IS_MC_ESC_CODE(code, syn)
-            if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE);
-
+            token.type = TokenType.STRING;
+            token.base = 0;
             token.backP = p;
-            fetch();
-
-            token.setC(c);
-            token.escaped = true;
-            switch(c) {
 
-            case '*':
-                if (syntax.opEscAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
-                break;
-            case '+':
-                if (syntax.opEscPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
-                break;
-            case '?':
-                if (syntax.opEscQMarkZeroOne()) fetchTokenFor_repeat(0, 1);
-                break;
-            case '{':
-                if (syntax.opEscBraceInterval()) fetchTokenFor_openBrace();
-                break;
-            case '|':
-                if (syntax.opEscVBarAlt()) token.type = TokenType.ALT;
-                break;
-            case '(':
-                if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_OPEN;
-                break;
-            case ')':
-                if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE;
-                break;
-            case 'w':
-                if (syntax.opEscWWord()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
-                break;
-            case 'W':
-                if (syntax.opEscWWord()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
-                break;
-            case 'b':
-                if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.WORD_BOUND);
-                break;
-            case 'B':
-                if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.NOT_WORD_BOUND);
-                break;
-            case '<':
-                if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_BEGIN);
-                break;
-            case '>':
-                if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_END);
-                break;
-            case 's':
-                if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
-                break;
-            case 'S':
-                if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
-                break;
-            case 'd':
-                if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
-                break;
-            case 'D':
-                if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
-                break;
-            case 'h':
-                if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
-                break;
-            case 'H':
-                if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(true, CharacterType.XDIGIT);
-                break;
-            case 'A':
-                if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
-                break;
-            case 'Z':
-                if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.SEMI_END_BUF);
-                break;
-            case 'z':
-                if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF);
-                break;
-            case 'G':
-                if (syntax.opEscCapitalGBeginAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_POSITION);
-                break;
-            case '`':
-                if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
-                break;
-            case '\'':
-                if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF);
-                break;
-            case 'x':
-                fetchTokenFor_xBrace(); // extract to helper for all 'x'
-                break;
-            case 'u':
-                fetchTokenFor_uHex(); // extract to helper
-                break;
-            case '1':
-            case '2':
-            case '3':
-            case '4':
-            case '5':
-            case '6':
-            case '7':
-            case '8':
-            case '9':
-                unfetch();
-                last = p;
-                int num = scanUnsignedNumber();
-                if (num < 0 || num > Config.MAX_BACKREF_NUM) {
-                    // goto skip_backref
-                } else if (syntax.opDecimalBackref() && (num <= env.numMem || num <= 9)) { /* This spec. from GNU regex */
-                    if (syntax.strictCheckBackref()) {
-                        if (num > env.numMem || env.memNodes == null || env.memNodes[num] == null) newValueException(ERR_INVALID_BACKREF);
-                    }
-                    token.type = TokenType.BACKREF;
-                    token.setBackrefNum(1);
-                    token.setBackrefRef1(num);
-                    token.setBackrefByName(false);
-                    if (Config.USE_BACKREF_WITH_LEVEL) token.setBackrefExistLevel(false);
-                    break;
-                }
-                // skip_backref:
-                if (c == '8' || c == '9') {
-                    /* normal char */
-                    p = last;
-                    inc();
-                    break;
-                }
-                p = last;
-                /* fall through */
-            case '0':
-                fetchTokenFor_zero();
-                break;
-            case 'k':
-                if (Config.USE_NAMED_GROUP) fetchTokenFor_namedBackref();
-                break;
-            case 'g':
-                if (Config.USE_SUBEXP_CALL) fetchTokenFor_subexpCall();
-                break;
-            case 'Q':
-                if (syntax.op2EscCapitalQQuote()) token.type = TokenType.QUOTE_OPEN;
-                break;
-            case 'p':
-            case 'P':
-                fetchTokenFor_charProperty();
-                break;
-
-            default:
-                unfetch();
-                num = fetchEscapedValue();
-
-                /* set_raw: */
-                if (token.getC() != num) {
-                    token.type = TokenType.CODE_POINT;
-                    token.setCode(num);
-                } else { /* string */
-                    p = token.backP + enc.length(bytes, token.backP, stop);
-                }
-                break;
+            fetch();
 
-            } // switch (c)
+            if (c == syntax.metaCharTable.esc && !syntax.op2IneffectiveEscape()) { // IS_MC_ESC_CODE(code, syn)
+                if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE);
 
-        } else {
-            token.setC(c);
-            token.escaped = false;
-
-            // remove code duplication
-            if (Config.USE_VARIABLE_META_CHARS) {
-                if (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters()) {
-                    if (c == syntax.metaCharTable.anyChar) {
-                        token.type = TokenType.ANYCHAR;
-                    } else if (c == syntax.metaCharTable.anyTime) {
-                        fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
-                    }  else if (c == syntax.metaCharTable.zeroOrOneTime) {
-                        fetchTokenFor_repeat(0, 1);
-                    } else if (c == syntax.metaCharTable.oneOrMoreTime) {
-                        fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
-                    } else if (c == syntax.metaCharTable.anyCharAnyTime) {
-                        token.type = TokenType.ANYCHAR_ANYTIME;
-                        // goto out
-                    }
-                    break;
-                }
-            } // USE_VARIABLE_META_CHARS
+                token.backP = p;
+                fetch();
 
-            {
+                token.setC(c);
+                token.escaped = true;
                 switch(c) {
-                case '.':
-                    if (syntax.opDotAnyChar()) token.type = TokenType.ANYCHAR;
-                    break;
+
                 case '*':
-                    if (syntax.opAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
+                    if (syntax.opEscAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
                     break;
                 case '+':
-                    if (syntax.opPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
+                    if (syntax.opEscPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
                     break;
                 case '?':
-                    if (syntax.opQMarkZeroOne()) fetchTokenFor_repeat(0, 1);
+                    if (syntax.opEscQMarkZeroOne()) fetchTokenFor_repeat(0, 1);
                     break;
                 case '{':
-                    if (syntax.opBraceInterval()) fetchTokenFor_openBrace();
+                    if (syntax.opEscBraceInterval()) fetchTokenFor_openBrace();
                     break;
                 case '|':
-                    if (syntax.opVBarAlt()) token.type = TokenType.ALT;
+                    if (syntax.opEscVBarAlt()) token.type = TokenType.ALT;
                     break;
-
                 case '(':
-                    if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
-                        inc();
-                        if (peekIs('#')) {
-                            fetch();
-                            while (true) {
-                                if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP);
-                                fetch();
-                                if (c == syntax.metaCharTable.esc) {
-                                    if (left()) fetch();
-                                } else {
-                                    if (c == ')') break;
-                                }
-                            }
-                            continue start; // goto start
-                        }
-                        unfetch();
-                    }
-
-                    if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_OPEN;
+                    if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_OPEN;
                     break;
                 case ')':
-                    if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE;
+                    if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE;
                     break;
-                case '^':
-                    if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE);
+                case 'w':
+                    if (syntax.opEscWWord()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
                     break;
-                case '$':
-                    if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.SEMI_END_BUF : AnchorType.END_LINE);
+                case 'W':
+                    if (syntax.opEscWWord()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
                     break;
-                case '[':
-                    if (syntax.opBracketCC()) token.type = TokenType.CC_CC_OPEN;
+                case 'b':
+                    if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.WORD_BOUND);
                     break;
-                case ']':
-                    //if (*src > env->pattern)   /* /].../ is allowed. */
-                    //CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
+                case 'B':
+                    if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.NOT_WORD_BOUND);
                     break;
-                case '#':
-                    if (Option.isExtend(env.option)) {
-                        while (left()) {
-                            fetch();
-                            if (enc.isNewLine(c)) break;
-                        }
-                        continue start; // goto start
-                    }
+                case '<':
+                    if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_BEGIN);
+                    break;
+                case '>':
+                    if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_END);
+                    break;
+                case 's':
+                    if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
+                    break;
+                case 'S':
+                    if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
+                    break;
+                case 'd':
+                    if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
+                    break;
+                case 'D':
+                    if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
+                    break;
+                case 'h':
+                    if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
+                    break;
+                case 'H':
+                    if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(true, CharacterType.XDIGIT);
+                    break;
+                case 'A':
+                    if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
+                    break;
+                case 'Z':
+                    if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.SEMI_END_BUF);
+                    break;
+                case 'z':
+                    if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF);
+                    break;
+                case 'G':
+                    if (syntax.opEscCapitalGBeginAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_POSITION);
+                    break;
+                case '`':
+                    if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
+                    break;
+                case '\'':
+                    if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF);
+                    break;
+                case 'x':
+                    fetchTokenFor_xBrace();
+                    break;
+                case 'u':
+                    fetchTokenFor_uHex();
+                    break;
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                case '9':
+                    fetchTokenFor_digit();
+                    break;
+                case '0':
+                    fetchTokenFor_zero();
+                    break;
+                case 'k':
+                    if (Config.USE_NAMED_GROUP) fetchTokenFor_namedBackref();
+                    break;
+                case 'g':
+                    if (Config.USE_SUBEXP_CALL) fetchTokenFor_subexpCall();
+                    break;
+                case 'Q':
+                    if (syntax.op2EscCapitalQQuote()) token.type = TokenType.QUOTE_OPEN;
+                    break;
+                case 'p':
+                case 'P':
+                    fetchTokenFor_charProperty();
                     break;
 
-                case ' ':
-                case '\t':
-                case '\n':
-                case '\r':
-                case '\f':
-                    if (Option.isExtend(env.option)) continue start; // goto start
+                default:
+                    unfetch();
+                    int num = fetchEscapedValue();
+
+                    /* set_raw: */
+                    if (token.getC() != num) {
+                        token.type = TokenType.CODE_POINT;
+                        token.setCode(num);
+                    } else { /* string */
+                        p = token.backP + enc.length(bytes, token.backP, stop);
+                    }
                     break;
 
-                default: // string
+                } // switch (c)
+
+            } else {
+                token.setC(c);
+                token.escaped = false;
+
+                if (Config.USE_VARIABLE_META_CHARS && (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters())) {
+                    fetchTokenFor_metaChars();
                     break;
+                }
+
+                {
+                    switch(c) {
+                    case '.':
+                        if (syntax.opDotAnyChar()) token.type = TokenType.ANYCHAR;
+                        break;
+                    case '*':
+                        if (syntax.opAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
+                        break;
+                    case '+':
+                        if (syntax.opPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
+                        break;
+                    case '?':
+                        if (syntax.opQMarkZeroOne()) fetchTokenFor_repeat(0, 1);
+                        break;
+                    case '{':
+                        if (syntax.opBraceInterval()) fetchTokenFor_openBrace();
+                        break;
+                    case '|':
+                        if (syntax.opVBarAlt()) token.type = TokenType.ALT;
+                        break;
+
+                    case '(':
+                        if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
+                            inc();
+                            if (peekIs('#')) {
+                                fetch();
+                                while (true) {
+                                    if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP);
+                                    fetch();
+                                    if (c == syntax.metaCharTable.esc) {
+                                        if (left()) fetch();
+                                    } else {
+                                        if (c == ')') break;
+                                    }
+                                }
+                                continue start; // goto start
+                            }
+                            unfetch();
+                        }
 
-                } // switch
+                        if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_OPEN;
+                        break;
+                    case ')':
+                        if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE;
+                        break;
+                    case '^':
+                        if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE);
+                        break;
+                    case '$':
+                        if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.SEMI_END_BUF : AnchorType.END_LINE);
+                        break;
+                    case '[':
+                        if (syntax.opBracketCC()) token.type = TokenType.CC_CC_OPEN;
+                        break;
+                    case ']':
+                        //if (*src > env->pattern)   /* /].../ is allowed. */
+                        //CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
+                        break;
+                    case '#':
+                        if (Option.isExtend(env.option)) {
+                            while (left()) {
+                                fetch();
+                                if (enc.isNewLine(c)) break;
+                            }
+                            continue start; // goto start
+                        }
+                        break;
+
+                    case ' ':
+                    case '\t':
+                    case '\n':
+                    case '\r':
+                    case '\f':
+                        if (Option.isExtend(env.option)) continue start; // goto start
+                        break;
+
+                    default: // string
+                        break;
+
+                    } // switch
+                }
             }
-        }
 
-        break;
+            break;
         } // while
         return token.type;
     }
diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java
index ca34d5a..596bdd9 100644
--- a/src/org/joni/Parser.java
+++ b/src/org/joni/Parser.java
@@ -148,7 +148,7 @@ class Parser extends Lexer {
     private CClassNode parseCharClass() {
         fetchTokenInCC();
 
-        boolean neg;
+        final boolean neg;
         if (token.type == TokenType.CHAR && token.getC() == '^' && !token.escaped) {
             neg = true;
             fetchTokenInCC();

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git



More information about the pkg-java-commits mailing list