[jruby-joni] 19/194: grapheme clusters

Hideki Yamane henrich at moszumanska.debian.org
Thu Feb 1 12:04:09 UTC 2018


This is an automated email from the git hooks/post-receive script.

henrich pushed a commit to branch debian/sid
in repository jruby-joni.

commit b00c05c829633f364a54da3ca2e4c3f189aca1fa
Author: Marcin Mielzynski <lopx at gazeta.pl>
Date:   Tue Dec 19 20:26:28 2017 +0100

    grapheme clusters
---
 src/org/joni/Parser.java | 384 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 358 insertions(+), 26 deletions(-)

diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java
index ffe80c6..cbd8267 100644
--- a/src/org/joni/Parser.java
+++ b/src/org/joni/Parser.java
@@ -24,10 +24,10 @@ import static org.joni.BitStatus.bsOnOff;
 import static org.joni.Option.isDontCaptureGroup;
 import static org.joni.Option.isIgnoreCase;
 
+import org.jcodings.Encoding;
 import org.jcodings.Ptr;
 import org.jcodings.constants.CharacterType;
 import org.jcodings.constants.PosixBracket;
-import org.jcodings.unicode.UnicodeEncoding;
 import org.joni.ast.AnchorNode;
 import org.joni.ast.AnyCharNode;
 import org.joni.ast.BackRefNode;
@@ -760,7 +760,7 @@ class Parser extends Lexer {
             break;
 
         case EXTENDED_GRAPHEME_CLUSTER:
-            node = parseExtendedGraphemeCluster(node);
+            node = parseExtendedGraphemeCluster();
             break;
 
         case KEEP:
@@ -862,33 +862,365 @@ class Parser extends Lexer {
         return en;
     }
 
-    private Node parseExtendedGraphemeCluster(Node node) {
-        if (Config.USE_UNICODE_PROPERTIES) {
-            if (enc.isUnicode()) {
-                int ctype = enc.propertyNameToCType(new byte[]{(byte)'M'}, 0, 1);
-                if (ctype > 0) {
-                    CClassNode cc1 = new CClassNode(); /* \P{M} */
-                    cc1.addCType(ctype, false, env, this);
-                    cc1.setNot();
-                    CClassNode cc2 = new CClassNode(); /* \p{M}* */
-                    cc1.addCType(ctype, false, env, this);
-                    QuantifierNode qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
-                    qn.setTarget(cc2);
-                    /* (?>...) */
-                    EncloseNode en2 = new EncloseNode(EncloseType.STOP_BACKTRACK);
-                    /* \P{M}\p{M}* */
-                    en2.setTarget(ConsAltNode.newListNode(cc1, ConsAltNode.newListNode(qn, null)));
-                    node = en2;
-                }
+    private static class GraphemeNames {
+        static final byte[]Grapheme_Cluster_Break_Extend = "graphemeclusterbreak=extend".getBytes();
+        static final byte[]Grapheme_Cluster_Break_SpacingMark = "graphemeclusterbreak=spacingmark".getBytes();
+        static final byte[]Grapheme_Cluster_Break_Control = "graphemeclusterbreak=control".getBytes();
+        static final byte[]Grapheme_Cluster_Break_T = "graphemeclusterbreak=t".getBytes();
+        static final byte[]Grapheme_Cluster_Break_L = "graphemeclusterbreak=l".getBytes();
+        static final byte[]Grapheme_Cluster_Break_LVT = "graphemeclusterbreak=lvt".getBytes();
+        static final byte[]Grapheme_Cluster_Break_V = "graphemeclusterbreak=v".getBytes();
+        static final byte[]Grapheme_Cluster_Break_LV = "graphemeclusterbreak=lv".getBytes();
+        static final byte[]Grapheme_Cluster_Break_E_Modifier = "graphemeclusterbreak=emodifier".getBytes();
+        static final byte[]Grapheme_Cluster_Break_E_Base = "graphemeclusterbreak=ebase".getBytes();
+        static final byte[]Grapheme_Cluster_Break_E_Base_GAZ = "graphemeclusterbreak=ebasegaz".getBytes();
+        static final byte[]Grapheme_Cluster_Break_Glue_After_Zwj = "graphemeclusterbreak=glueafterzwj".getBytes();
+        static final byte[]Grapheme_Cluster_Break_Prepend = "graphemeclusterbreak=prepend".getBytes();
+
+
+        static final int Glue_After_Zwj_Ranges[] = new int[] {
+            13,
+            0x1F308, 0x1F308,
+            0x1F33E, 0x1F33E,
+            0x1F373, 0x1F373,
+            0x1F393, 0x1F393,
+            0x1F3A4, 0x1F3A4,
+            0x1F3A8, 0x1F3A8,
+            0x1F3EB, 0x1F3EB,
+            0x1F3ED, 0x1F3ED,
+            0x1F4BB, 0x1F4BC,
+            0x1F527, 0x1F527,
+            0x1F52C, 0x1F52C,
+            0x1F680, 0x1F680,
+            0x1F692, 0x1F692,
+        };
+
+        static final int Emoji_Ranges[] = new int[] {
+            4,
+            0x2640, 0x2640,
+            0x2642, 0x2642,
+            0x2695, 0x2696,
+            0x2708, 0x2708,
+        };
+
+        static final int E_Base_Ranges[] = new int[] {
+            8,
+            0x1F3C2, 0x1F3C2,
+            0x1F3C7, 0x1F3C7,
+            0x1F3CC, 0x1F3CC,
+            0x1F3F3, 0x1F3F3,
+            0x1F441, 0x1F441,
+            0x1F46F, 0x1F46F,
+            0x1F574, 0x1F574,
+            0x1F6CC, 0x1F6CC,
+        };
+
+        static int nameToCtype(Encoding enc, byte[]name) {
+            return enc.propertyNameToCType(name, 0, name.length);
+        }
+    }
+
+    private Node parseExtendedGraphemeCluster() {
+        ConsAltNode alt;
+        if (Config.USE_UNICODE_PROPERTIES && enc.isUnicode()) {
+            int sbOut = enc.minLength() > 1 ? 0x00 : 0x80;
+            int extend = GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Extend);
+            CClassNode cc = new CClassNode();
+            cc.addCType(extend, false, env, this);
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_SpacingMark), false, env, this);
+            cc.addCodeRange(env, 0x200D, 0x200D);
+            QuantifierNode qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            ConsAltNode list = ConsAltNode.newListNode(qn, null);
+
+            /* ( RI-sequence | Hangul-Syllable | !Control ) */
+
+            /* !Control */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Control), true, env, this);
+            if (enc.minLength() > 1) {
+                CodeRangeBuffer buff = new CodeRangeBuffer();
+                buff = CodeRangeBuffer.addCodeRange(buff, env, 0x0a, 0x0a);
+                buff = CodeRangeBuffer.addCodeRange(buff, env, 0x0d, 0x0d);
+                cc.mbuf = CodeRangeBuffer.andCodeRangeBuff(cc.mbuf, false, buff, true);
+            } else {
+                cc.bs.clear(0x0a);
+                cc.bs.clear(0x0d);
             }
+
+            alt = ConsAltNode.newAltNode(cc, null);
+
+            /* Hangul-Syllable
+             *  := L* V+ T*
+             *  | L* LV V* T*
+             *  | L* LVT T*
+             *  | L+
+             *  | T+ */
+
+            /* T+ */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+            qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            alt = ConsAltNode.newAltNode(qn, alt);
+
+            /* L+ */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+            qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            alt = ConsAltNode.newAltNode(qn, alt);
+
+            /* L* LVT T* */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+
+            ConsAltNode list2;
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_LVT), false, env, this);
+            list2 = ConsAltNode.newListNode(cc, list2);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            alt = ConsAltNode.newAltNode(list2, alt);
+
+            /* L* LV V* T* */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_V), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_LV), false, env, this);
+            list2 = ConsAltNode.newListNode(cc, list2);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            alt = ConsAltNode.newAltNode(list2, alt);
+
+            /* L* V+ T* */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_T), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_V), false, env, this);
+            qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_L), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            alt = ConsAltNode.newAltNode(list2, alt);
+
+            /* Emoji sequence := (E_Base | EBG) Extend* E_Modifier?
+             *                   (ZWJ (Glue_After_Zwj | EBG Extend* E_Modifier?) )* */
+
+            /* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, env, this);
+            qn = new QuantifierNode(0, 1, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCType(extend, false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, env, this);
+            list2 = ConsAltNode.newListNode(cc, list2);
+
+            ConsAltNode alt2 = ConsAltNode.newAltNode(list2, null);
+
+            /* Glue_After_Zwj */
+            cc = new CClassNode();
+            cc.addCType(extend, false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCTypeByRange(-1, false, enc, sbOut, GraphemeNames.Glue_After_Zwj_Ranges);
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Glue_After_Zwj), false, env, this);
+            list2 = ConsAltNode.newListNode(cc, list2);
+
+            alt2 = ConsAltNode.newAltNode(list2, alt2);
+
+            /* Emoji variation sequence
+             * http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt
+             */
+
+            StringNode str = new StringNode();
+            str.catCode(0xfe0f, enc);
+            str.setRaw();
+            qn = new QuantifierNode(0, 1, false);
+            qn.setTarget(str);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCTypeByRange(-1, false, enc, sbOut, GraphemeNames.Emoji_Ranges);
+            list2 = ConsAltNode.newListNode(cc, list2);
+
+            alt2 = ConsAltNode.newAltNode(list2, alt2);
+
+            list2 = ConsAltNode.newListNode(alt2, null);
+
+            /* ZWJ */
+            str = new StringNode();
+            str.catCode(0x200D, enc);
+            str.setRaw();
+            list2 = ConsAltNode.newListNode(str, list2);
+
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(list2);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            /* E_Modifier? */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, env, this);
+            qn = new QuantifierNode(0, 1, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            /* Extend* */
+            cc = new CClassNode();
+            cc.addCType(extend, false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            /* (E_Base | EBG) */
+            cc = new CClassNode();
+            cc.addCTypeByRange(-1, false, enc, sbOut, GraphemeNames.E_Base_Ranges);
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base), false, env, this);
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, env, this);
+            list2 = ConsAltNode.newListNode(cc, list2);
+
+            alt = ConsAltNode.newAltNode(list2, alt);
+
+            /* ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
+            /* a sequence starting with ZWJ seems artificial, but GraphemeBreakTest
+             * has such examples.
+             * http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html
+             */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Modifier), false, env, this);
+            qn = new QuantifierNode(0, 1, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Glue_After_Zwj), false, env, this);
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_E_Base_GAZ), false, env, this);
+            list2 = ConsAltNode.newListNode(cc, list2);
+
+            str = new StringNode();
+            str.catCode(0x200D, enc);
+            str.setRaw();
+            list2 = ConsAltNode.newListNode(str, list2);
+
+            alt = ConsAltNode.newAltNode(list2, alt);
+
+            /* RI-Sequence := Regional_Indicator{2} */
+            cc = new CClassNode();
+            cc.addCodeRange(env, 0x1F1E6, 0x1F1FF);
+            qn = new QuantifierNode(2, 2, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            alt = ConsAltNode.newAltNode(list2, alt);
+
+            list = ConsAltNode.newListNode(alt, list);
+
+            /* Prepend* */
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Prepend), false, env, this);
+            qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list = ConsAltNode.newListNode(qn, list);
+
+            /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
+            AnyCharNode any = new AnyCharNode();
+            int option = bsOnOff(env.option, Option.MULTILINE, false);
+            EncloseNode enclose = new EncloseNode(option, 0);
+            enclose.setTarget(any);
+
+            alt = ConsAltNode.newAltNode(enclose, null);
+
+            /* Prepend+ */
+            str = new StringNode();
+            str.catCode(0x200D, enc);
+            str.setRaw();
+            qn = new QuantifierNode(0, 1, false);
+            qn.setTarget(str);
+            list2 = ConsAltNode.newListNode(qn, null);
+
+            cc = new CClassNode();
+            cc.addCType(GraphemeNames.nameToCtype(enc, GraphemeNames.Grapheme_Cluster_Break_Prepend), false, env, this);
+            qn = new QuantifierNode(1, QuantifierNode.REPEAT_INFINITE, false);
+            qn.setTarget(cc);
+            list2 = ConsAltNode.newListNode(qn, list2);
+
+            alt = ConsAltNode.newAltNode(list2, alt);
+
+            alt = ConsAltNode.newAltNode(list, alt);
+        } else {
+            /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
+            AnyCharNode any = new AnyCharNode();
+            int option = bsOnOff(env.option, Option.MULTILINE, false);
+            EncloseNode enclose = new EncloseNode(option, 0);
+            enclose.setTarget(any);
+            alt = ConsAltNode.newAltNode(enclose, null);
         }
-        if (node == null) {
-            AnyCharNode np1 = new AnyCharNode();
-            EncloseNode on = new EncloseNode(bsOnOff(env.option, Option.MULTILINE, false), 0);
-            on.setTarget(np1);
-            node = np1;
+
+        /* \x0D\x0A */
+        StringNode str = new StringNode();
+        str.catCode(0x0D, enc);
+        str.catCode(0x0A, enc);
+        str.setRaw();
+        alt = ConsAltNode.newAltNode(str, alt);
+
+        /* (?>\x0D\x0A|...) */
+        EncloseNode enclose = new EncloseNode(EncloseNode.STOP_BACKTRACK);
+        enclose.setTarget(alt);
+
+        if (Config.USE_UNICODE_PROPERTIES && enc.isUnicode()) {
+            int option = bsOnOff(env.option, Option.IGNORECASE, true);
+            EncloseNode enc = new EncloseNode(option, 0);
+            enc.setTarget(enclose);
+            return enc;
+        } else {
+            return enclose;
         }
-        return node;
     }
 
     private Node parseExpTkByte(boolean group) {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git



More information about the pkg-java-commits mailing list