[Python-modules-commits] [python-regex] 01/04: Import python-regex_0.1.20170117.orig.tar.gz

Sandro Tosi morph at moszumanska.debian.org
Tue Jan 24 00:39:01 UTC 2017


This is an automated email from the git hooks/post-receive script.

morph pushed a commit to branch master
in repository python-regex.

commit cafa5a91ebc48cf82e557af185ed97774d4732fd
Author: Sandro Tosi <morph at debian.org>
Date:   Mon Jan 23 19:35:20 2017 -0500

    Import python-regex_0.1.20170117.orig.tar.gz
---
 PKG-INFO               |   2 +-
 Python2/_regex.c       | 101 +++++++++++++++++++++++++++--------
 Python2/_regex_core.py | 139 ++++++++++++++++++++++++++++++++++++-------------
 Python2/regex.py       |   2 +-
 Python2/test_regex.py  |   6 +++
 Python3/_regex.c       | 101 +++++++++++++++++++++++++++--------
 Python3/_regex_core.py | 139 ++++++++++++++++++++++++++++++++++++-------------
 Python3/regex.py       |   2 +-
 Python3/test_regex.py  |   6 +++
 setup.py               |   2 +-
 10 files changed, 380 insertions(+), 120 deletions(-)

diff --git a/PKG-INFO b/PKG-INFO
index 774bcd5..db1dac0 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: regex
-Version: 2016.12.27
+Version: 2017.01.17
 Summary: Alternative regular expression module, to replace re.
 Home-page: https://bitbucket.org/mrabarnett/mrab-regex
 Author: Matthew Barnett
diff --git a/Python2/_regex.c b/Python2/_regex.c
index bd77820..1fed68d 100644
--- a/Python2/_regex.c
+++ b/Python2/_regex.c
@@ -653,6 +653,7 @@ typedef struct PatternObject {
     RE_Node* start_test;
     size_t true_group_count; /* The true number of capture groups. */
     size_t public_group_count; /* The number of public capture groups. */
+    size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...)). */
     size_t repeat_count; /* The number of repeats. */
     Py_ssize_t group_end_index; /* The number of group closures. */
     PyObject* groupindex;
@@ -750,6 +751,7 @@ typedef struct RE_CompileArgs {
     RE_Node* start; /* The start node. */
     RE_Node* end; /* The end node. */
     size_t repeat_depth; /* The nesting depth of the repeat. */
+    size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...)). */
     BOOL forward; /* Whether it's a forward (not reverse) pattern. */
     BOOL visible_captures; /* Whether all of the captures will be visible. */
     BOOL has_captures; /* Whether the pattern has capture groups. */
@@ -757,6 +759,7 @@ typedef struct RE_CompileArgs {
     BOOL within_fuzzy; /* Whether the subpattern is within a fuzzy section. */
     BOOL has_groups; /* Whether the subpattern contains captures. */
     BOOL has_repeats; /* Whether the subpattern contains repeats. */
+    BOOL in_define; /* Whether we're in (?(DEFINE)...). */
 } RE_CompileArgs;
 
 /* The string slices which will be concatenated to make the result string of
@@ -2421,6 +2424,31 @@ Py_LOCAL_INLINE(BOOL) same_char_ign(RE_EncodingTable* encoding, RE_LocaleInfo*
     return FALSE;
 }
 
+/* Checks whether 2 characters are the same, ignoring case. The first character
+ * is already case-folded or is a possible Turkic 'I'.
+ */
+Py_LOCAL_INLINE(BOOL) same_char_ign_turkic(RE_EncodingTable* encoding,
+  RE_LocaleInfo* locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
+    int count;
+    Py_UCS4 cases[RE_MAX_CASES];
+    int i;
+
+    if (ch1 == ch2)
+        return TRUE;
+
+    if (!encoding->possible_turkic(locale_info, ch1))
+        return FALSE;
+
+    count = encoding->all_turkic_i(locale_info, ch1, cases);
+
+    for (i = 1; i < count; i++) {
+        if (cases[i] == ch2)
+            return TRUE;
+    }
+
+    return FALSE;
+}
+
 /* Wrapper for calling 'same_char' via a pointer. */
 static BOOL same_char_ign_wrapper(RE_EncodingTable* encoding, RE_LocaleInfo*
   locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
@@ -6221,7 +6249,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
     Py_ssize_t length;
     Py_ssize_t s_pos;
     Py_UCS4 folded[RE_MAX_FOLDED];
-
     state = safe_state->re_state;
     encoding = state->encoding;
     locale_info = state->locale_info;
@@ -6256,7 +6283,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
             f_pos = 0;
         }
 
-        if (s_pos < length && same_char_ign(encoding, locale_info,
+        if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
           values[s_pos], folded[f_pos])) {
             ++s_pos;
             ++f_pos;
@@ -6297,7 +6324,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
     Py_ssize_t length;
     Py_ssize_t s_pos;
     Py_UCS4 folded[RE_MAX_FOLDED];
-
     state = safe_state->re_state;
     encoding = state->encoding;
     locale_info = state->locale_info;
@@ -6331,7 +6357,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
             f_pos = 0;
         }
 
-        if (s_pos < length && same_char_ign(encoding, locale_info,
+        if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
           values[length - s_pos - 1], folded[folded_len - f_pos - 1])) {
             ++s_pos;
             ++f_pos;
@@ -7032,8 +7058,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD(RE_State* state, RE_NextNode* next,
             f_pos = 0;
         }
 
-        if (!same_char_ign(encoding, locale_info, folded[f_pos],
-          values[s_pos]))
+        if (!same_char_ign(encoding, locale_info, values[s_pos],
+          folded[f_pos]))
             return RE_ERROR_FAILURE;
 
         ++s_pos;
@@ -7105,8 +7131,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD_REV(RE_State* state, RE_NextNode*
             f_pos = 0;
         }
 
-        if (!same_char_ign(encoding, locale_info, folded[folded_len - f_pos -
-          1], values[length - s_pos - 1]))
+        if (!same_char_ign(encoding, locale_info, values[length - s_pos - 1],
+          folded[folded_len - f_pos - 1]))
             return RE_ERROR_FAILURE;
 
         ++s_pos;
@@ -13522,8 +13548,8 @@ advance:
 
                 if (folded_pos < folded_len && same_char_ign(encoding,
                   locale_info,
-                   folded[folded_pos],
-                   gfolded[gfolded_pos])) {
+                   gfolded[gfolded_pos],
+                   folded[folded_pos])) {
                     ++folded_pos;
                     ++gfolded_pos;
                 } else if (node->status & RE_STATUS_FUZZY) {
@@ -13626,8 +13652,8 @@ advance:
                 }
 
                 if (folded_pos > 0 && same_char_ign(encoding, locale_info,
-                   folded[folded_pos - 1],
-                   gfolded[gfolded_pos - 1])) {
+                   gfolded[gfolded_pos - 1],
+                   folded[folded_pos - 1])) {
                     --folded_pos;
                     --gfolded_pos;
                 } else if (node->status & RE_STATUS_FUZZY) {
@@ -14194,7 +14220,7 @@ advance:
                     }
 
                     if (folded_pos < folded_len && same_char_ign(encoding,
-                      locale_info, folded[folded_pos], values[string_pos])) {
+                      locale_info, values[string_pos], folded[folded_pos])) {
                         ++string_pos;
                         ++folded_pos;
 
@@ -14306,7 +14332,7 @@ advance:
                     }
 
                     if (folded_pos > 0 && same_char_ign(encoding, locale_info,
-                      folded[folded_pos - 1], values[string_pos - 1])) {
+                      values[string_pos - 1], folded[folded_pos - 1])) {
                         --string_pos;
                         --folded_pos;
 
@@ -15540,6 +15566,7 @@ backtrack:
             RE_Node* repeated;
             RE_Node* test;
             BOOL match;
+            Py_ssize_t skip_pos;
             BOOL m;
             size_t index;
             TRACE(("%s\n", re_op_text[bt_data->op]))
@@ -15569,7 +15596,7 @@ backtrack:
             index = node->values[0];
 
             match = FALSE;
-
+            skip_pos = -1;
             if (test->status & RE_STATUS_FUZZY) {
                 for (;;) {
                     RE_Position next_position;
@@ -15866,6 +15893,7 @@ backtrack:
                         if (!is_repeat_guarded(safe_state, index, pos,
                           RE_STATUS_TAIL)) {
                             match = TRUE;
+                            skip_pos = new_pos;
                             break;
                         }
                     }
@@ -15923,6 +15951,7 @@ backtrack:
                         if (!is_repeat_guarded(safe_state, index, pos,
                           RE_STATUS_TAIL)) {
                             match = TRUE;
+                            skip_pos = new_pos;
                             break;
                         }
                     }
@@ -16165,6 +16194,12 @@ backtrack:
                 }
 
                 node = node->next_1.node;
+
+                if (skip_pos >= 0) {
+                    state->text_pos = skip_pos;
+                    node = node->next_1.node;
+                }
+
                 goto advance;
             } else {
                 /* The tail couldn't match. */
@@ -21238,7 +21273,7 @@ static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject*
             break;
 
         /* Don't bother to build a MatchObject. */
-        switch (self->public_group_count) {
+        switch (self->visible_capture_count) {
         case 0:
             if (state.reverse) {
                 b = state.text_pos;
@@ -22981,6 +23016,7 @@ Py_LOCAL_INLINE(int) build_FUZZY(RE_CompileArgs* args) {
     args->is_fuzzy = TRUE;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
     ++args->code;
 
@@ -23021,15 +23057,16 @@ Py_LOCAL_INLINE(int) build_ATOMIC(RE_CompileArgs* args) {
         return RE_ERROR_ILLEGAL;
 
     args->code = subargs.code;
-    ++args->code;
-
-    /* Check the subpattern. */
     args->min_width += subargs.min_width;
     args->has_captures |= subargs.has_captures;
     args->is_fuzzy |= subargs.is_fuzzy;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
+    ++args->code;
+
+    /* Check the subpattern. */
     if (subargs.has_groups)
         atomic_node->status |= RE_STATUS_HAS_GROUPS;
 
@@ -23142,6 +23179,7 @@ Py_LOCAL_INLINE(int) build_BRANCH(RE_CompileArgs* args) {
         return RE_ERROR_ILLEGAL;
 
     args->code = subargs.code;
+    args->visible_capture_count = subargs.visible_capture_count;
 
     ++args->code;
     args->min_width += min_width;
@@ -23190,6 +23228,7 @@ Py_LOCAL_INLINE(int) build_CALL_REF(RE_CompileArgs* args) {
     args->is_fuzzy |= subargs.is_fuzzy;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
     ++args->code;
 
@@ -23286,14 +23325,15 @@ Py_LOCAL_INLINE(int) build_CONDITIONAL(RE_CompileArgs* args) {
         return RE_ERROR_ILLEGAL;
 
     args->code = subargs.code;
-    ++args->code;
-
-    /* Check the lookaround subpattern. */
     args->has_captures |= subargs.has_captures;
     args->is_fuzzy |= subargs.is_fuzzy;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
+    ++args->code;
+
+    /* Check the lookaround subpattern. */
     if (subargs.has_groups)
         test_node->status |= RE_STATUS_HAS_GROUPS;
 
@@ -23321,6 +23361,7 @@ Py_LOCAL_INLINE(int) build_CONDITIONAL(RE_CompileArgs* args) {
     args->is_fuzzy |= subargs.is_fuzzy;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
     min_width = subargs.min_width;
 
@@ -23349,6 +23390,7 @@ Py_LOCAL_INLINE(int) build_CONDITIONAL(RE_CompileArgs* args) {
         args->is_fuzzy |= subargs.is_fuzzy;
         args->has_groups |= subargs.has_groups;
         args->has_repeats |= subargs.has_repeats;
+        args->visible_capture_count = subargs.visible_capture_count;
 
         min_width = min_ssize_t(min_width, subargs.min_width);
 
@@ -23427,6 +23469,10 @@ Py_LOCAL_INLINE(int) build_GROUP(RE_CompileArgs* args) {
     args->is_fuzzy |= subargs.is_fuzzy;
     args->has_groups |= TRUE;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
+
+    if (!args->in_define)
+        ++args->visible_capture_count;
 
     ++args->code;
 
@@ -23508,6 +23554,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
     start_node->values[0] = group;
 
     subargs = *args;
+    subargs.in_define = TRUE;
     status = build_sequence(&subargs);
     if (status != RE_ERROR_SUCCESS)
         return status;
@@ -23517,6 +23564,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
     args->is_fuzzy |= subargs.is_fuzzy;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
     min_width = subargs.min_width;
 
@@ -23540,6 +23588,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
         args->code = subargs.code;
         args->has_captures |= subargs.has_captures;
         args->is_fuzzy |= subargs.is_fuzzy;
+        args->visible_capture_count = subargs.visible_capture_count;
 
         if (group == 0) {
             /* Join the 2 branches end-to-end and bypass it. The sequence
@@ -23552,6 +23601,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
         } else {
             args->has_groups |= subargs.has_groups;
             args->has_repeats |= subargs.has_repeats;
+            args->visible_capture_count = subargs.visible_capture_count;
 
             min_width = min_ssize_t(min_width, subargs.min_width);
 
@@ -23624,6 +23674,7 @@ Py_LOCAL_INLINE(int) build_LOOKAROUND(RE_CompileArgs* args) {
     args->is_fuzzy |= subargs.is_fuzzy;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
     if (subargs.has_groups)
         lookaround_node->status |= RE_STATUS_HAS_GROUPS;
@@ -23760,6 +23811,7 @@ Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) {
         args->is_fuzzy |= subargs.is_fuzzy;
         args->has_groups |= subargs.has_groups;
         args->has_repeats |= subargs.has_repeats;
+        args->visible_capture_count = subargs.visible_capture_count;
 
         ++args->code;
 
@@ -23805,6 +23857,7 @@ Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) {
         args->is_fuzzy |= subargs.is_fuzzy;
         args->has_groups |= subargs.has_groups;
         args->has_repeats = TRUE;
+        args->visible_capture_count = subargs.visible_capture_count;
 
         ++args->code;
 
@@ -24303,6 +24356,8 @@ Py_LOCAL_INLINE(BOOL) compile_to_nodes(RE_CODE* code, RE_CODE* end_code,
     args.repeat_depth = 0;
     args.is_fuzzy = FALSE;
     args.within_fuzzy = FALSE;
+    args.visible_capture_count = 0;
+    args.in_define = FALSE;
     status = build_sequence(&args);
     if (status == RE_ERROR_ILLEGAL)
         set_error(RE_ERROR_ILLEGAL, NULL);
@@ -24314,6 +24369,7 @@ Py_LOCAL_INLINE(BOOL) compile_to_nodes(RE_CODE* code, RE_CODE* end_code,
     pattern->is_fuzzy = args.is_fuzzy;
     pattern->do_search_start = TRUE;
     pattern->start_node = args.start;
+    pattern->visible_capture_count = args.visible_capture_count;
 
     /* Optimise the pattern. */
     if (!optimise_pattern(pattern))
@@ -24548,6 +24604,7 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
     self->repeat_count = 0;
     self->true_group_count = 0;
     self->public_group_count = public_group_count;
+    self->visible_capture_count = 0;
     self->group_end_index = 0;
     self->groupindex = groupindex;
     self->indexgroup = indexgroup;
diff --git a/Python2/_regex_core.py b/Python2/_regex_core.py
index 9a79e08..b101340 100644
--- a/Python2/_regex_core.py
+++ b/Python2/_regex_core.py
@@ -286,10 +286,14 @@ def _fold_case(info, string):
 
     return _regex.fold_case(flags, string)
 
-def is_cased(info, char):
+def is_cased_i(info, char):
     "Checks whether a character is cased."
     return len(_regex.get_all_cases(info.flags, char)) > 1
 
+def is_cased_f(flags, char):
+    "Checks whether a character is cased."
+    return len(_regex.get_all_cases(flags, char)) > 1
+
 def _compile_firstset(info, fs):
     "Compiles the firstset for the pattern."
     reverse = bool(info.flags & REVERSE)
@@ -314,7 +318,7 @@ def _check_firstset(info, reverse, fs):
 
 #        if i.case_flags:
 #            if isinstance(i, Character):
-#                if is_cased(info, i.value):
+#                if is_cased_i(info, i.value):
 #                    return []
 #            elif isinstance(i, SetBase):
 #                return []
@@ -1891,9 +1895,6 @@ class RegexBase(object):
     def compile(self, reverse=False, fuzzy=False):
         return self._compile(reverse, fuzzy)
 
-    def dump(self, indent, reverse):
-        self._dump(indent, reverse)
-
     def is_empty(self):
         return False
 
@@ -1930,7 +1931,7 @@ class ZeroWidthBase(RegexBase):
             flags |= REVERSE_OP
         return [(self._opcode, flags)]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%s%s %s" % (INDENT * indent, self._op_name,
           POS_TEXT[self.positive])
 
@@ -1950,7 +1951,7 @@ class Any(RegexBase):
             flags |= FUZZY_OP
         return [(self._opcode[reverse], flags)]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%s%s" % (INDENT * indent, self._op_name)
 
     def max_width(self):
@@ -2003,7 +2004,7 @@ class Atomic(RegexBase):
         return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
           [(OP.END, )])
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sATOMIC" % (INDENT * indent)
         self.subpattern.dump(indent + 1, reverse)
 
@@ -2114,7 +2115,7 @@ class Branch(RegexBase):
 
         return code
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sBRANCH" % (INDENT * indent)
         self.branches[0].dump(indent + 1, reverse)
         for b in self.branches[1 : ]:
@@ -2444,7 +2445,7 @@ class CallGroup(RegexBase):
     def _compile(self, reverse, fuzzy):
         return [(OP.GROUP_CALL, self.call_ref)]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sGROUP_CALL %s" % (INDENT * indent, self.group)
 
     def __eq__(self, other):
@@ -2517,7 +2518,7 @@ class Character(RegexBase):
 
         return code.compile(reverse, fuzzy)
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         display = repr(unichr(self.value)).lstrip("bu")
         print "%sCHARACTER %s %s%s" % (INDENT * indent,
           POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags])
@@ -2605,7 +2606,7 @@ class Conditional(RegexBase):
 
         return code
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sGROUP_EXISTS %s" % (INDENT * indent, self.group)
         self.yes_item.dump(indent + 1, reverse)
         if not self.no_item.is_empty():
@@ -2740,7 +2741,7 @@ class Fuzzy(RegexBase):
         return ([(OP.FUZZY, flags) + tuple(arguments)] +
           self.subpattern.compile(reverse, True) + [(OP.END,)])
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         constraints = self._constraints_to_string()
         if constraints:
             constraints = " " + constraints
@@ -2799,7 +2800,7 @@ class Grapheme(RegexBase):
 
         return grapheme_matcher.compile(reverse, fuzzy)
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sGRAPHEME" % (INDENT * indent)
 
     def max_width(self):
@@ -2864,7 +2865,7 @@ class GreedyRepeat(RegexBase):
 
         return ([tuple(repeat)] + subpattern + [(OP.END, )])
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         if self.max_count is None:
             limit = "INF"
         else:
@@ -2962,7 +2963,7 @@ class Group(RegexBase):
 
         return code
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         group = self.group
         if group < 0:
             group = private_groups[group]
@@ -3026,7 +3027,7 @@ class LookAround(RegexBase):
         return ([(OP.LOOKAROUND, int(self.positive), int(not self.behind))] +
           self.subpattern.compile(self.behind) + [(OP.END, )])
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sLOOK%s %s" % (INDENT * indent, self._dir_text[self.behind],
           POS_TEXT[self.positive])
         self.subpattern.dump(indent + 1, self.behind)
@@ -3106,7 +3107,7 @@ class LookAroundConditional(RegexBase):
 
         return code
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print("%sCONDITIONAL %s %s" % (INDENT * indent,
           self._dir_text[self.behind], POS_TEXT[self.positive]))
         self.subpattern.dump(indent + 1, self.behind)
@@ -3177,7 +3178,7 @@ class Property(RegexBase):
             flags |= FUZZY_OP
         return [(self._opcode[self.case_flags, reverse], flags, self.value)]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         prop = PROPERTY_NAMES[self.value >> 16]
         name, value = prop[0], prop[1][self.value & 0xFFFF]
         print "%sPROPERTY %s %s:%s%s" % (INDENT * indent,
@@ -3259,7 +3260,7 @@ class Range(RegexBase):
         return [(self._opcode[self.case_flags, reverse], flags, self.lower,
           self.upper)]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         display_lower = repr(unichr(self.lower)).lstrip("bu")
         display_upper = repr(unichr(self.upper)).lstrip("bu")
         print "%sRANGE %s %s %s%s" % (INDENT * indent, POS_TEXT[self.positive],
@@ -3310,7 +3311,7 @@ class RefGroup(RegexBase):
             flags |= FUZZY_OP
         return [(self._opcode[self.case_flags, reverse], flags, self.group)]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sREF_GROUP %s%s" % (INDENT * indent, self.group,
           CASE_TEXT[self.case_flags])
 
@@ -3355,7 +3356,7 @@ class Sequence(RegexBase):
                 if s.case_flags != case_flags:
                     # Different case sensitivity, so flush, unless neither the
                     # previous nor the new character are cased.
-                    if s.case_flags or is_cased(info, s.value):
+                    if s.case_flags or is_cased_i(info, s.value):
                         Sequence._flush_characters(info, characters,
                           case_flags, items)
 
@@ -3366,7 +3367,7 @@ class Sequence(RegexBase):
                 if s.case_flags != case_flags:
                     # Different case sensitivity, so flush, unless the neither
                     # the previous nor the new string are cased.
-                    if s.case_flags or any(is_cased(info, c) for c in
+                    if s.case_flags or any(is_cased_i(info, c) for c in
                       characters):
                         Sequence._flush_characters(info, characters,
                           case_flags, items)
@@ -3423,7 +3424,7 @@ class Sequence(RegexBase):
 
         return code
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         for s in self.items:
             s.dump(indent, reverse)
 
@@ -3434,16 +3435,82 @@ class Sequence(RegexBase):
 
         # Disregard case_flags if all of the characters are case-less.
         if case_flags & IGNORECASE:
-            if not any(is_cased(info, c) for c in characters):
+            if not any(is_cased_i(info, c) for c in characters):
                 case_flags = NOCASE
 
-        if len(characters) == 1:
-            items.append(Character(characters[0], case_flags=case_flags))
+        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
+            literals = Sequence._fix_full_casefold(characters)
+
+            for item in literals:
+                chars = item.characters
+
+                if len(chars) == 1:
+                    items.append(Character(chars[0], case_flags=item.case_flags))
+                else:
+                    items.append(String(chars, case_flags=item.case_flags))
         else:
-            items.append(String(characters, case_flags=case_flags))
+            if len(characters) == 1:
+                items.append(Character(characters[0], case_flags=case_flags))
+            else:
+                items.append(String(characters, case_flags=case_flags))
 
         characters[:] = []
 
+    @staticmethod
+    def _fix_full_casefold(characters):
+        # Split a literal needing full case-folding into chunks that need it
+        # and chunks that can use simple case-folding, which is faster.
+        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
+          _regex.get_expand_on_folding()]
+        string = _regex.fold_case(FULL_CASE_FOLDING, u''.join(unichr(c)
+          for c in characters)).lower()
+        chunks = []
+
+        for e in expanded:
+            found = string.find(e)
+
+            while found >= 0:
+                chunks.append((found, found + len(e)))
+                found = string.find(e, found + 1)
+
+        pos = 0
+        literals = []
+
+        for start, end in Sequence._merge_chunks(chunks):
+            if pos < start:
+                literals.append(Literal(characters[pos : start],
+                  case_flags=IGNORECASE))
+
+            literals.append(Literal(characters[start : end],
+              case_flags=FULLIGNORECASE))
+            pos = end
+
+        if pos < len(characters):
+            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))
+
+        return literals
+
+    @staticmethod
+    def _merge_chunks(chunks):
+        if len(chunks) < 2:
+            return chunks
+
+        chunks.sort()
+
+        start, end = chunks[0]
+        new_chunks = []
+
+        for s, e in chunks[1 : ]:
+            if s <= end:
+                end = max(end, e)
+            else:
+                new_chunks.append((start, end))
+                start, end = s, e
+
+        new_chunks.append((start, end))
+
+        return new_chunks
+
     def is_empty(self):
         return all(i.is_empty() for i in self.items)
 
@@ -3509,7 +3576,7 @@ class SetBase(RegexBase):
 
         return code
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%s%s %s%s" % (INDENT * indent, self._op_name,
           POS_TEXT[self.positive], CASE_TEXT[self.case_flags])
         for i in self.items:
@@ -3789,7 +3856,7 @@ class String(RegexBase):
         return [(self._opcode[self.case_flags, reverse], flags,
           len(self.folded_characters)) + self.folded_characters]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu")
         print "%sSTRING %s%s" % (INDENT * indent, display,
           CASE_TEXT[self.case_flags])
@@ -3801,11 +3868,11 @@ class String(RegexBase):
         return 0, self
 
 class Literal(String):
-    def _dump(self, indent, reverse):
-        for c in self.characters:
-            display = repr(unichr(c)).lstrip("bu")
-            print "%sCHARACTER MATCH %s%s" % (INDENT * indent, display,
-              CASE_TEXT[self.case_flags])
+    def dump(self, indent, reverse):
+        literal = ''.join(unichr(c) for c in self.characters)
+        display = repr(literal).lstrip("bu")
+        print "%sLITERAL MATCH %s%s" % (INDENT * indent, display,
+          CASE_TEXT[self.case_flags])
 
 class StringSet(RegexBase):
     _opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False):
@@ -3862,7 +3929,7 @@ class StringSet(RegexBase):
             return [(self._opcode[case_flags, reverse], index, min_len,
               max_len)]
 
-    def _dump(self, indent, reverse):
+    def dump(self, indent, reverse):
         print "%sSTRING_SET %s%s" % (INDENT * indent, self.name,
           CASE_TEXT[self.case_flags])
 
diff --git a/Python2/regex.py b/Python2/regex.py
index b34a14c..b29cbbd 100644
--- a/Python2/regex.py
+++ b/Python2/regex.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
   "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W",
   "WORD", "error", "Regex"]
 
-__version__ = "2.4.113"
+__version__ = "2.4.117"
 
 # --------------------------------------------------------------------
 # Public interface.
diff --git a/Python2/test_regex.py b/Python2/test_regex.py
index 982eed7..b1fd510 100644
--- a/Python2/test_regex.py
+++ b/Python2/test_regex.py
@@ -3717,6 +3717,12 @@ thing
         self.assertEqual(regex.search(r'a?yz', 'xxxxyz', flags=regex.FULLCASE |
           regex.IGNORECASE).span(), (4, 6))
 
+        # Hg issue 230: Is it a bug of (?(DEFINE)...)
+        self.assertEqual(regex.findall(r'(?:(?![a-d]).)+', 'abcdefgh'),
+          ['efgh'])
+        self.assertEqual(regex.findall(r'''(?(DEFINE)(?P<mydef>(?:(?![a-d]).)))(?&mydef)+''',
+          'abcdefgh'), ['efgh'])
+
     def test_subscripted_captures(self):
         self.assertEqual(regex.match(r'(?P<x>.)+',
           'abc').expandf('{0} {0[0]} {0[-1]}'), 'abc abc abc')
diff --git a/Python3/_regex.c b/Python3/_regex.c
index ad468b6..3a6648c 100644
--- a/Python3/_regex.c
+++ b/Python3/_regex.c
@@ -640,6 +640,7 @@ typedef struct PatternObject {
     RE_Node* start_test;
     size_t true_group_count; /* The true number of capture groups. */
     size_t public_group_count; /* The number of public capture groups. */
+    size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...)). */
     size_t repeat_count; /* The number of repeats. */
     Py_ssize_t group_end_index; /* The number of group closures. */
     PyObject* groupindex;
@@ -735,6 +736,7 @@ typedef struct RE_CompileArgs {
     RE_Node* start; /* The start node. */
     RE_Node* end; /* The end node. */
     size_t repeat_depth; /* The nesting depth of the repeat. */
+    size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...)). */
     BOOL forward; /* Whether it's a forward (not reverse) pattern. */
     BOOL visible_captures; /* Whether all of the captures will be visible. */
     BOOL has_captures; /* Whether the pattern has capture groups. */
@@ -742,6 +744,7 @@ typedef struct RE_CompileArgs {
     BOOL within_fuzzy; /* Whether the subpattern is within a fuzzy section. */
     BOOL has_groups; /* Whether the subpattern contains captures. */
     BOOL has_repeats; /* Whether the subpattern contains repeats. */
+    BOOL in_define; /* Whether we're in (?(DEFINE)...). */
 } RE_CompileArgs;
 
 /* The string slices which will be concatenated to make the result string of
@@ -2415,6 +2418,31 @@ Py_LOCAL_INLINE(BOOL) same_char_ign(RE_EncodingTable* encoding, RE_LocaleInfo*
     return FALSE;
 }
 
+/* Checks whether 2 characters are the same, ignoring case. The first character
+ * is already case-folded or is a possible Turkic 'I'.
+ */
+Py_LOCAL_INLINE(BOOL) same_char_ign_turkic(RE_EncodingTable* encoding,
+  RE_LocaleInfo* locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
+    int count;
+    Py_UCS4 cases[RE_MAX_CASES];
+    int i;
+
+    if (ch1 == ch2)
+        return TRUE;
+
+    if (!encoding->possible_turkic(locale_info, ch1))
+        return FALSE;
+
+    count = encoding->all_turkic_i(locale_info, ch1, cases);
+
+    for (i = 1; i < count; i++) {
+        if (cases[i] == ch2)
+            return TRUE;
+    }
+
+    return FALSE;
+}
+
 /* Wrapper for calling 'same_char' via a pointer. */
 static BOOL same_char_ign_wrapper(RE_EncodingTable* encoding, RE_LocaleInfo*
   locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
@@ -6215,7 +6243,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
     Py_ssize_t length;
     Py_ssize_t s_pos;
     Py_UCS4 folded[RE_MAX_FOLDED];
-
     state = safe_state->re_state;
     encoding = state->encoding;
     locale_info = state->locale_info;
@@ -6250,7 +6277,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
             f_pos = 0;
         }
 
-        if (s_pos < length && same_char_ign(encoding, locale_info,
+        if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
           values[s_pos], folded[f_pos])) {
             ++s_pos;
             ++f_pos;
@@ -6291,7 +6318,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
     Py_ssize_t length;
     Py_ssize_t s_pos;
     Py_UCS4 folded[RE_MAX_FOLDED];
-
     state = safe_state->re_state;
     encoding = state->encoding;
     locale_info = state->locale_info;
@@ -6325,7 +6351,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
             f_pos = 0;
         }
 
-        if (s_pos < length && same_char_ign(encoding, locale_info,
+        if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
           values[length - s_pos - 1], folded[folded_len - f_pos - 1])) {
             ++s_pos;
             ++f_pos;
@@ -7026,8 +7052,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD(RE_State* state, RE_NextNode* next,
             f_pos = 0;
         }
 
-        if (!same_char_ign(encoding, locale_info, folded[f_pos],
-          values[s_pos]))
+        if (!same_char_ign(encoding, locale_info, values[s_pos],
+          folded[f_pos]))
             return RE_ERROR_FAILURE;
 
         ++s_pos;
@@ -7099,8 +7125,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD_REV(RE_State* state, RE_NextNode*
             f_pos = 0;
         }
 
-        if (!same_char_ign(encoding, locale_info, folded[folded_len - f_pos -
-          1], values[length - s_pos - 1]))
+        if (!same_char_ign(encoding, locale_info, values[length - s_pos - 1],
+          folded[folded_len - f_pos - 1]))
             return RE_ERROR_FAILURE;
 
         ++s_pos;
@@ -13551,8 +13577,8 @@ advance:
 
                 if (folded_pos < folded_len && same_char_ign(encoding,
                   locale_info,
-                   folded[folded_pos],
-                   gfolded[gfolded_pos])) {
+                   gfolded[gfolded_pos],
+                   folded[folded_pos])) {
                     ++folded_pos;
                     ++gfolded_pos;
                 } else if (node->status & RE_STATUS_FUZZY) {
@@ -13655,8 +13681,8 @@ advance:
                 }
 
                 if (folded_pos > 0 && same_char_ign(encoding, locale_info,
-                   folded[folded_pos - 1],
-                   gfolded[gfolded_pos - 1])) {
+                   gfolded[gfolded_pos - 1],
+                   folded[folded_pos - 1])) {
                     --folded_pos;
                     --gfolded_pos;
                 } else if (node->status & RE_STATUS_FUZZY) {
@@ -14223,7 +14249,7 @@ advance:
                     }
 
                     if (folded_pos < folded_len && same_char_ign(encoding,
-                      locale_info, folded[folded_pos], values[string_pos])) {
+                      locale_info, values[string_pos], folded[folded_pos])) {
                         ++string_pos;
                         ++folded_pos;
 
@@ -14335,7 +14361,7 @@ advance:
                     }
 
                     if (folded_pos > 0 && same_char_ign(encoding, locale_info,
-                      folded[folded_pos - 1], values[string_pos - 1])) {
+                      values[string_pos - 1], folded[folded_pos - 1])) {
                         --string_pos;
                         --folded_pos;
 
@@ -15569,6 +15595,7 @@ backtrack:
             RE_Node* repeated;
             RE_Node* test;
             BOOL match;
+            Py_ssize_t skip_pos;
             BOOL m;
             size_t index;
             TRACE(("%s\n", re_op_text[bt_data->op]))
@@ -15598,7 +15625,7 @@ backtrack:
             index = node->values[0];
 
             match = FALSE;
-
+            skip_pos = -1;
             if (test->status & RE_STATUS_FUZZY) {
                 for (;;) {
                     RE_Position next_position;
@@ -15895,6 +15922,7 @@ backtrack:
                         if (!is_repeat_guarded(safe_state, index, pos,
                           RE_STATUS_TAIL)) {
                             match = TRUE;
+                            skip_pos = new_pos;
                             break;
                         }
                     }
@@ -15952,6 +15980,7 @@ backtrack:
                         if (!is_repeat_guarded(safe_state, index, pos,
                           RE_STATUS_TAIL)) {
                             match = TRUE;
+                            skip_pos = new_pos;
                             break;
                         }
                     }
@@ -16194,6 +16223,12 @@ backtrack:
                 }
 
                 node = node->next_1.node;
+
+                if (skip_pos >= 0) {
+                    state->text_pos = skip_pos;
+                    node = node->next_1.node;
+                }
+
                 goto advance;
             } else {
                 /* The tail couldn't match. */
@@ -21168,7 +21203,7 @@ static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject*
             break;
 
         /* Don't bother to build a MatchObject. */
-        switch (self->public_group_count) {
+        switch (self->visible_capture_count) {
         case 0:
             if (state.reverse) {
                 b = state.text_pos;
@@ -22898,6 +22933,7 @@ Py_LOCAL_INLINE(int) build_FUZZY(RE_CompileArgs* args) {
     args->is_fuzzy = TRUE;
     args->has_groups |= subargs.has_groups;
     args->has_repeats |= subargs.has_repeats;
+    args->visible_capture_count = subargs.visible_capture_count;
 
... 549 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-regex.git



More information about the Python-modules-commits mailing list