[Python-modules-commits] [python-regex] 01/04: Import python-regex_0.1.20160605.orig.tar.gz

Sandro Tosi morph at moszumanska.debian.org
Sat Jun 11 18:25:20 UTC 2016


This is an automated email from the git hooks/post-receive script.

morph pushed a commit to branch master
in repository python-regex.

commit e5fde42a75e8bb5ca6da3df100321f7ba9c6e6e2
Author: Sandro Tosi <morph at debian.org>
Date:   Sat Jun 11 19:20:45 2016 +0100

    Import python-regex_0.1.20160605.orig.tar.gz
---
 PKG-INFO               |   4 +-
 Python2/_regex.c       | 229 ++++++++++++++++++++++++++++++------------------
 Python2/_regex_core.py |  46 ++++++----
 Python2/regex.py       |   2 +-
 Python2/test_regex.py  |  53 +++++++++++
 Python3/_regex.c       | 233 +++++++++++++++++++++++++++++++------------------
 Python3/_regex_core.py |  46 ++++++----
 Python3/regex.py       |   2 +-
 Python3/test_regex.py  |  53 +++++++++++
 docs/Features.rst      |   2 +
 setup.py               |   2 +-
 11 files changed, 464 insertions(+), 208 deletions(-)

diff --git a/PKG-INFO b/PKG-INFO
index 1afff98..f9efcbe 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: regex
-Version: 2016.04.25
+Version: 2016.06.05
 Summary: Alternative regular expression module, to replace re.
 Home-page: https://bitbucket.org/mrabarnett/mrab-regex
 Author: Matthew Barnett
@@ -148,6 +148,8 @@ Description: Introduction
         
         The issue numbers relate to the Python bug tracker, except where listed as "Hg issue".
         
+        * Fixed support for pickling compiled regexes (Hg issue 195)
+        
         * Added support for lookaround in conditional pattern (Hg issue 163)
         
           The test of a conditional pattern can now be a lookaround.
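
As a usage note on the pickling fix listed above: with this release a compiled
pattern can be serialised with the standard pickle module and restored later,
which is what the changes to _regex.c below implement. A minimal sketch, using
an illustrative pattern and text rather than anything from the package's own
test suite:

    import pickle
    import regex

    # Compile, pickle and restore a pattern in one round trip.
    p = regex.compile(r'(?P<word>\w+)')
    data = pickle.dumps(p)    # serialises the pattern's pickled data
    q = pickle.loads(data)    # recompiles it from the packed code list
    print(q.match('hello').group('word'))    # -> hello
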
diff --git a/Python2/_regex.c b/Python2/_regex.c
index e3effcc..eda58b7 100644
--- a/Python2/_regex.c
+++ b/Python2/_regex.c
@@ -380,6 +380,7 @@ typedef struct RE_AtomicData {
     RE_BacktrackData* backtrack;
     struct RE_SavedGroups* saved_groups;
     struct RE_SavedRepeats* saved_repeats;
+    struct RE_GroupCallFrame* call_frame;
     Py_ssize_t slice_start;
     Py_ssize_t slice_end;
     Py_ssize_t text_pos;
@@ -644,7 +645,7 @@ typedef struct PatternObject {
     PyObject_HEAD
     PyObject* pattern; /* Pattern source (or None). */
     Py_ssize_t flags; /* Flags used when compiling pattern source. */
-    RE_UINT8* packed_code_list;
+    PyObject* packed_code_list;
     PyObject* weakreflist; /* List of weak references */
     /* Nodes into which the regular expression is compiled. */
     RE_Node* start_node;
@@ -2770,10 +2771,30 @@ Py_LOCAL_INLINE(void) clear_groups(RE_State* state) {
     }
 }
 
+/* Resets the various guards. */
+Py_LOCAL_INLINE(void) reset_guards(RE_State* state) {
+    size_t i;
+
+    /* Reset the guards for the repeats. */
+    for (i = 0; i < state->pattern->repeat_count; i++) {
+        reset_guard_list(&state->repeats[i].body_guard_list);
+        reset_guard_list(&state->repeats[i].tail_guard_list);
+    }
+
+    /* Reset the guards for the fuzzy sections. */
+    for (i = 0; i < state->pattern->fuzzy_count; i++) {
+        reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
+        reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
+    }
+
+    /* Reset the guards for the group calls. */
+    for (i = 0; i < state->pattern->call_ref_info_count; i++)
+        reset_guard_list(&state->group_call_guard_list[i]);
+}
+
 /* Initialises the state for a match. */
 Py_LOCAL_INLINE(void) init_match(RE_State* state) {
     RE_AtomicBlock* current;
-    size_t i;
 
     /* Reset the backtrack. */
     state->current_backtrack_block = &state->backtrack_block;
@@ -2793,24 +2814,11 @@ Py_LOCAL_INLINE(void) init_match(RE_State* state) {
         state->current_atomic_block->count = 0;
     }
 
-    /* Reset the guards for the repeats. */
-    for (i = 0; i < state->pattern->repeat_count; i++) {
-        reset_guard_list(&state->repeats[i].body_guard_list);
-        reset_guard_list(&state->repeats[i].tail_guard_list);
-    }
-
-    /* Reset the guards for the fuzzy sections. */
-    for (i = 0; i < state->pattern->fuzzy_count; i++) {
-        reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
-        reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
-    }
-
     /* Clear the groups. */
     clear_groups(state);
 
-    /* Reset the guards for the group calls. */
-    for (i = 0; i < state->pattern->call_ref_info_count; i++)
-        reset_guard_list(&state->group_call_guard_list[i]);
+    /* Reset the guards. */
+    reset_guards(state);
 
     /* Clear the counts and cost for matching. */
     if (state->pattern->is_fuzzy) {
@@ -9138,18 +9146,27 @@ Py_LOCAL_INLINE(BOOL) is_repeat_guarded(RE_SafeState* safe_state, size_t index,
 }
 
 /* Builds a Unicode string. */
-Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len,
-  Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t start,
+  Py_ssize_t end, Py_ssize_t buffer_charsize) {
+    Py_ssize_t len;
+
+    buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+    len = end - start;
+
     return PyUnicode_FromUnicode(buffer, len);
 }
 
 /* Builds a bytestring. Returns NULL if any member is too wide. */
-Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t len,
-  Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t start,
+  Py_ssize_t end, Py_ssize_t buffer_charsize) {
+    Py_ssize_t len;
     Py_UCS1* byte_buffer;
     Py_ssize_t i;
     PyObject* result;
 
+    buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+    len = end - start;
+
     if (buffer_charsize == 1)
         return Py_BuildValue("s#", buffer, len);
 
@@ -9184,11 +9201,10 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
     int status;
 
     if (state->is_unicode)
-        string = build_unicode_value(state->point_to(state->text, first), last
-          - first, state->charsize);
+        string = build_unicode_value(state->text, first, last,
+          state->charsize);
     else
-        string = build_bytes_value(state->point_to(state->text, first), last -
-          first, state->charsize);
+        string = build_bytes_value(state->text, first, last, state->charsize);
     if (!string)
         return RE_ERROR_INTERNAL;
 
@@ -9200,8 +9216,8 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
 
 /* Looks for a string in a string set, ignoring case. */
 Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
-  string_set, void* buffer, Py_ssize_t index, Py_ssize_t len, Py_ssize_t
-  buffer_charsize) {
+  string_set, void* buffer, Py_ssize_t first, Py_ssize_t last, Py_ssize_t
+   index, Py_ssize_t buffer_charsize) {
     Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
     void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
     RE_EncodingTable* encoding;
@@ -9233,11 +9249,11 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
     possible_turkic = encoding->possible_turkic;
 
     /* Look for a possible Turkic 'I'. */
-    while (index < len && !possible_turkic(locale_info, char_at(buffer,
+    while (index < last && !possible_turkic(locale_info, char_at(buffer,
       index)))
         ++index;
 
-    if (index < len) {
+    if (index < last) {
         /* Possible Turkic 'I'. */
         int count;
         int i;
@@ -9252,8 +9268,8 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
             set_char_at(buffer, index, codepoints[i]);
 
             /* Recurse for the remainder of the string. */
-            status = string_set_contains_ign(state, string_set, buffer, index +
-              1, len, buffer_charsize);
+            status = string_set_contains_ign(state, string_set, buffer, first,
+              last, index + 1, buffer_charsize);
             if (status != 0)
                 return status;
         }
@@ -9265,9 +9281,9 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
         int status;
 
         if (state->is_unicode)
-            string = build_unicode_value(buffer, len, buffer_charsize);
+            string = build_unicode_value(buffer, first, last, buffer_charsize);
         else
-            string = build_bytes_value(buffer, len, buffer_charsize);
+            string = build_bytes_value(buffer, first, last, buffer_charsize);
         if (!string)
             return RE_ERROR_MEMORY;
 
@@ -9531,7 +9547,6 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
     Py_ssize_t first;
     Py_ssize_t last;
     PyObject* string_set;
-    void* folded_buffer;
 
     state = safe_state->re_state;
     full_case_fold = state->encoding->full_case_fold;
@@ -9651,7 +9666,7 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
 
         /* Is the text we have a partial match? */
         status = string_set_contains_ign(state, string_set, folded, first,
-          last, folded_charsize);
+          last, first, folded_charsize);
         if (status < 0)
             goto finished;
 
@@ -9675,18 +9690,13 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
         goto finished;
     }
 
-    /* Point to the used portion of the folded buffer. */
-    folded_buffer = (void*)((Py_UCS1*)folded + first * folded_charsize);
-    last -= first;
-    first = 0;
-
     /* We've already looked for a partial match (if allowed), but what about a
      * complete match?
      */
     while (len >= min_len) {
         if (end_of_fold[len]) {
-            status = string_set_contains_ign(state, string_set, folded_buffer,
-              first, last, folded_charsize);
+            status = string_set_contains_ign(state, string_set, folded, first,
+              last, first, folded_charsize);
 
             if (status == 1) {
                 /* Advance past the match. */
@@ -9810,7 +9820,7 @@ Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state,
     }
 
     if (reverse) {
-        first = f_pos;
+        first = f_pos + 1;
         last = max_len;
     } else {
         first = 0;
@@ -9839,7 +9849,7 @@ Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state,
 
         /* Is the text we have a partial match? */
         status = string_set_contains_ign(state, string_set, folded, first,
-          last, folded_charsize);
+          last, first, folded_charsize);
         if (status < 0)
             goto finished;
 
@@ -9868,7 +9878,7 @@ Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state,
      */
     while (len >= min_len) {
         status = string_set_contains_ign(state, string_set, folded, first,
-          last, folded_charsize);
+          last, first, folded_charsize);
 
         if (status == 1) {
             /* Advance past the match. */
@@ -11683,6 +11693,7 @@ advance:
             atomic->is_lookaround = FALSE;
             atomic->has_groups = (node->status & RE_STATUS_HAS_GROUPS) != 0;
             atomic->has_repeats = (node->status & RE_STATUS_HAS_REPEATS) != 0;
+            atomic->call_frame = state->current_group_call_frame;
 
             /* Save the groups and repeats. */
             if (atomic->has_groups && !push_groups(safe_state))
@@ -14559,7 +14570,7 @@ backtrack:
         case RE_OP_ATOMIC: /* Start of an atomic group. */
         {
             RE_AtomicData* atomic;
-            /* backtrack to the start of an atomic group. */
+            /* Backtrack to the start of an atomic group. */
             atomic = pop_atomic(safe_state);
 
             if (atomic->has_repeats)
@@ -14570,6 +14581,7 @@ backtrack:
 
             state->too_few_errors = bt_data->atomic.too_few_errors;
             state->capture_change = bt_data->atomic.capture_change;
+            state->current_group_call_frame = atomic->call_frame;
 
             discard_backtrack(state);
             break;
@@ -14825,6 +14837,9 @@ backtrack:
             /* Clear the groups. */
             clear_groups(state);
 
+            /* Reset the guards. */
+            reset_guards(state);
+
             goto start_match;
         }
         case RE_OP_FUZZY: /* Fuzzy matching. */
@@ -21276,7 +21291,7 @@ static void pattern_dealloc(PyObject* self_) {
     Py_DECREF(self->named_list_indexes);
     Py_DECREF(self->required_chars);
     re_dealloc(self->locale_info);
-    re_dealloc(self->packed_code_list);
+    Py_DECREF(self->packed_code_list);
     PyObject_DEL(self);
 }
 
@@ -21346,13 +21361,13 @@ Py_LOCAL_INLINE(BOOL) append_integer(PyObject* list, Py_ssize_t value) {
 }
 
 /* Packs the code list that's needed for pickling. */
-Py_LOCAL_INLINE(RE_UINT8*) pack_code_list(RE_CODE* code, Py_ssize_t code_len) {
+Py_LOCAL_INLINE(PyObject*) pack_code_list(RE_CODE* code, Py_ssize_t code_len) {
     Py_ssize_t max_size;
     RE_UINT8* packed;
     Py_ssize_t count;
     RE_UINT32 value;
     Py_ssize_t i;
-    RE_UINT8* new_packed;
+    PyObject* packed_code_list;
 
     /* What is the maximum number of bytes needed to store it?
      *
@@ -21385,17 +21400,16 @@ Py_LOCAL_INLINE(RE_UINT8*) pack_code_list(RE_CODE* code, Py_ssize_t code_len) {
         packed[count++] = value;
     }
 
-    /* Discard the unused bytes. */
-    new_packed = re_realloc(packed, count);
-    if (new_packed)
-        packed = new_packed;
+    packed_code_list = PyString_FromStringAndSize((const char *)packed, count);
+    re_dealloc(packed);
 
-    return packed;
+    return packed_code_list;
 }
 
 /* Unpacks the code list that's needed for pickling. */
-Py_LOCAL_INLINE(PyObject*) unpack_code_list(RE_UINT8* packed) {
+Py_LOCAL_INLINE(PyObject*) unpack_code_list(PyObject* packed) {
     PyObject* code_list;
+    RE_UINT8* packed_data;
     Py_ssize_t index;
     RE_UINT32 value;
     int shift;
@@ -21405,18 +21419,19 @@ Py_LOCAL_INLINE(PyObject*) unpack_code_list(RE_UINT8* packed) {
     if (!code_list)
         return NULL;
 
+    packed_data = (RE_UINT8*)PyString_AsString(packed);
     index = 0;
 
     /* Unpack the length of the code list. */
     value = 0;
     shift = 0;
 
-    while (packed[index] >= 0x80) {
-        value |= (RE_UINT32)(packed[index++] & 0x7F) << shift;
+    while (packed_data[index] >= 0x80) {
+        value |= (RE_UINT32)(packed_data[index++] & 0x7F) << shift;
         shift += 7;
     }
 
-    value |= (RE_UINT32)packed[index++] << shift;
+    value |= (RE_UINT32)packed_data[index++] << shift;
     count = (size_t)value;
 
     /* Unpack each of the elements of the code list. */
@@ -21427,12 +21442,12 @@ Py_LOCAL_INLINE(PyObject*) unpack_code_list(RE_UINT8* packed) {
         value = 0;
         shift = 0;
 
-        while (packed[index] >= 0x80) {
-            value |= (RE_UINT32)(packed[index++] & 0x7F) << shift;
+        while (packed_data[index] >= 0x80) {
+            value |= (RE_UINT32)(packed_data[index++] & 0x7F) << shift;
             shift += 7;
         }
 
-        value |= (RE_UINT32)packed[index++] << shift;
+        value |= (RE_UINT32)packed_data[index++] << shift;
 #if PY_VERSION_HEX >= 0x02060000
         obj = PyLong_FromSize_t((size_t)value);
 #else
@@ -21658,20 +21673,15 @@ static PyObject* pattern_groupindex(PyObject* self_) {
 /* PatternObject's '_pickled_data' method. */
 static PyObject* pattern_pickled_data(PyObject* self_) {
     PatternObject* self;
-    PyObject* code_list;
     PyObject* pickled_data;
 
     self = (PatternObject*)self_;
 
-    code_list = unpack_code_list(self->packed_code_list);
-    if (!code_list)
-        return NULL;
-
     /* Build the data needed for picking. */
     pickled_data = Py_BuildValue("OnOOOOOnOnn", self->pattern, self->flags,
-      code_list, self->groupindex, self->indexgroup, self->named_lists,
-      self->named_list_indexes, self->req_offset, self->required_chars,
-      self->req_flags, self->public_group_count);
+      self->packed_code_list, self->groupindex, self->indexgroup,
+      self->named_lists, self->named_list_indexes, self->req_offset,
+      self->required_chars, self->req_flags, self->public_group_count);
 
     return pickled_data;
 }
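
For reference, the packed form that pack_code_list/unpack_code_list handle
above stores the length of the code list followed by each element, each value
written low 7 bits first with the high bit marking a continuation byte (the
final byte is below 0x80). A rough Python sketch of that byte-level scheme,
inferred from the C decoder above rather than copied from the package:

    def pack_value(out, value):
        # Emit 7 bits at a time, low bits first; 0x80 marks "more follows".
        while value >= 0x80:
            out.append((value & 0x7F) | 0x80)
            value >>= 7
        out.append(value)

    def unpack_value(data, index):
        # Mirror of the loop in unpack_code_list: returns (value, next index).
        value = 0
        shift = 0
        while data[index] >= 0x80:
            value |= (data[index] & 0x7F) << shift
            index += 1
            shift += 7
        value |= data[index] << shift
        return value, index + 1

    packed = bytearray()
    pack_value(packed, 300)
    print(unpack_value(packed, 0))    # -> (300, 2)
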
@@ -22005,10 +22015,25 @@ Py_LOCAL_INLINE(RE_STATUS_T) add_repeat_guards(PatternObject* pattern, RE_Node*
                 node->status |= RE_STATUS_VISITED_AG | result;
                 break;
             default:
-                node->status |= RE_STATUS_VISITED_AG;
-                CheckStack_push(&stack, node->next_1.node, result);
+            {
+                RE_Node* tail;
+                BOOL visited_tail;
+                RE_STATUS_T tail_result;
+
+                tail = node->next_1.node;
+                visited_tail = (tail->status & RE_STATUS_VISITED_AG);
+
+                if (visited_tail) {
+                    tail_result = tail->status & (RE_STATUS_REPEAT |
+                      RE_STATUS_REF);
+                    node->status |= RE_STATUS_VISITED_AG | tail_result;
+                } else {
+                    CheckStack_push(&stack, node, result);
+                    CheckStack_push(&stack, node->next_1.node, result);
+                }
                 break;
             }
+            }
         }
     }
 
@@ -24237,12 +24262,13 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
     PyObject* required_chars;
     Py_ssize_t req_flags;
     size_t public_group_count;
+    BOOL unpacked;
     Py_ssize_t code_len;
     RE_CODE* code;
     Py_ssize_t i;
     RE_CODE* req_chars;
     size_t req_length;
-    RE_UINT8* packed_code_list;
+    PyObject* packed_code_list;
     PatternObject* self;
     BOOL unicode;
     BOOL locale;
@@ -24254,11 +24280,29 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
       &req_offset, &required_chars, &req_flags, &public_group_count))
         return NULL;
 
+    /* If it came from a pickled source, code_list will be a packed code list
+     * in a bytestring.
+     */
+    if (PyString_Check(code_list)) {
+        packed_code_list = code_list;
+        code_list = unpack_code_list(packed_code_list);
+        if (!code_list)
+            return NULL;
+
+        unpacked = TRUE;
+    } else
+        unpacked = FALSE;
+
     /* Read the regex code. */
     code_len = PyList_GET_SIZE(code_list);
     code = (RE_CODE*)re_alloc((size_t)code_len * sizeof(RE_CODE));
-    if (!code)
+    if (!code) {
+        if (unpacked)
+            /* code_list has been built from a packed code list. */
+            Py_DECREF(code_list);
+
         return NULL;
+    }
 
     for (i = 0; i < code_len; i++) {
         PyObject* o;
@@ -24279,20 +24323,25 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
     /* Get the required characters. */
     get_required_chars(required_chars, &req_chars, &req_length);
 
-    /* Pack the code list in case it's needed for pickling. */
-    packed_code_list = pack_code_list(code, code_len);
-    if (!packed_code_list) {
-        set_error(RE_ERROR_MEMORY, NULL);
-        re_dealloc(req_chars);
-        re_dealloc(code);
-        return NULL;
+    if (!unpacked) {
+        /* Pack the code list in case it's needed for pickling. */
+        packed_code_list = pack_code_list(code, code_len);
+        if (!packed_code_list) {
+            set_error(RE_ERROR_MEMORY, NULL);
+            re_dealloc(req_chars);
+            re_dealloc(code);
+            return NULL;
+        }
     }
 
     /* Create the PatternObject. */
     self = PyObject_NEW(PatternObject, &Pattern_Type);
     if (!self) {
         set_error(RE_ERROR_MEMORY, NULL);
-        re_dealloc(packed_code_list);
+        if (unpacked)
+            Py_DECREF(code_list);
+        else
+            Py_DECREF(packed_code_list);
         re_dealloc(req_chars);
         re_dealloc(code);
         return NULL;
@@ -24335,6 +24384,8 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
     self->req_string = NULL;
     self->locale_info = NULL;
     Py_INCREF(self->pattern);
+    if (unpacked)
+        Py_INCREF(self->packed_code_list);
     Py_INCREF(self->groupindex);
     Py_INCREF(self->indexgroup);
     Py_INCREF(self->named_lists);
@@ -24367,6 +24418,8 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
     if (!ok) {
         Py_DECREF(self);
         re_dealloc(req_chars);
+        if (unpacked)
+            Py_DECREF(code_list);
         return NULL;
     }
 
@@ -24421,17 +24474,25 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
         self->locale_info = re_alloc(sizeof(RE_LocaleInfo));
         if (!self->locale_info) {
             Py_DECREF(self);
+            if (unpacked)
+                Py_DECREF(code_list);
             return NULL;
         }
 
         scan_locale_chars(self->locale_info);
     }
 
+    if (unpacked)
+        Py_DECREF(code_list);
+
     return (PyObject*)self;
 
 error:
     re_dealloc(code);
     set_error(RE_ERROR_ILLEGAL, NULL);
+    if (unpacked)
+        Py_DECREF(code_list);
+
     return NULL;
 }
 
@@ -24587,9 +24648,9 @@ static PyObject* fold_case(PyObject* self_, PyObject* args) {
 
     /* Build the result string. */
     if (str_info.is_unicode)
-        result = build_unicode_value(folded, folded_len, folded_charsize);
+        result = build_unicode_value(folded, 0, folded_len, folded_charsize);
     else
-        result = build_bytes_value(folded, folded_len, folded_charsize);
+        result = build_bytes_value(folded, 0, folded_len, folded_charsize);
 
     re_dealloc(folded);
 
@@ -24622,7 +24683,7 @@ static PyObject* get_expand_on_folding(PyObject* self, PyObject* unused) {
 
         codepoint = re_expand_on_folding[i];
 
-        item = build_unicode_value(&codepoint, 1, sizeof(codepoint));
+        item = build_unicode_value(&codepoint, 0, 1, sizeof(codepoint));
         if (!item)
             goto error;
 
diff --git a/Python2/_regex_core.py b/Python2/_regex_core.py
index 311f8e3..5599ed4 100644
--- a/Python2/_regex_core.py
+++ b/Python2/_regex_core.py
@@ -339,28 +339,38 @@ def _flatten_code(code):
 
     return flat_code
 
+def make_case_flags(info):
+    "Makes the case flags."
+    flags = info.flags & CASE_FLAGS
+
+    # Turn off FULLCASE if ASCII is turned on.
+    if info.flags & ASCII:
+        flags &= ~FULLCASE
+
+    return flags
+
 def make_character(info, value, in_set=False):
     "Makes a character literal."
     if in_set:
         # A character set is built case-sensitively.
         return Character(value)
 
-    return Character(value, case_flags=info.flags & CASE_FLAGS)
+    return Character(value, case_flags=make_case_flags(info))
 
 def make_ref_group(info, name, position):
     "Makes a group reference."
-    return RefGroup(info, name, position, case_flags=info.flags & CASE_FLAGS)
+    return RefGroup(info, name, position, case_flags=make_case_flags(info))
 
 def make_string_set(info, name):
     "Makes a string set."
-    return StringSet(info, name, case_flags=info.flags & CASE_FLAGS)
+    return StringSet(info, name, case_flags=make_case_flags(info))
 
 def make_property(info, prop, in_set):
     "Makes a property."
     if in_set:
         return prop
 
-    return prop.with_flags(case_flags=info.flags & CASE_FLAGS)
+    return prop.with_flags(case_flags=make_case_flags(info))
 
 def _parse_pattern(source, info):
     "Parses a pattern, eg. 'a|b|c'."
@@ -518,10 +528,6 @@ def parse_limited_quantifier(source):
         # No minimum means 0 and no maximum means unlimited.
         min_count = int(min_count or 0)
         max_count = int(max_count) if max_count else None
-
-        if max_count is not None and min_count > max_count:
-            raise error("min repeat greater than max repeat", source.string,
-              saved_pos)
     else:
         if not min_count:
             source.pos = saved_pos
@@ -529,22 +535,26 @@ def parse_limited_quantifier(source):
 
         min_count = max_count = int(min_count)
 
-    if is_above_limit(min_count) or is_above_limit(max_count):
-        raise error("repeat count too big", source.string, saved_pos)
-
     if not source.match ("}"):
         source.pos = saved_pos
         return None
 
+    if is_above_limit(min_count) or is_above_limit(max_count):
+        raise error("repeat count too big", source.string, saved_pos)
+
+    if max_count is not None and min_count > max_count:
+        raise error("min repeat greater than max repeat", source.string,
+          saved_pos)
+
     return min_count, max_count
 
 def parse_fuzzy(source, ch):
     "Parses a fuzzy setting, if present."
+    saved_pos = source.pos
+
     if ch != "{":
         return None
 
-    saved_pos = source.pos
-
     constraints = {}
     try:
         parse_fuzzy_item(source, constraints)
@@ -645,10 +655,10 @@ def parse_cost_limit(source):
 def parse_constraint(source, constraints, ch):
     "Parses a constraint."
     if ch not in "deis":
-        raise error("bad fuzzy constraint", source.string, source.pos)
+        raise ParseError()
 
     if ch in constraints:
-        raise error("repeated fuzzy constraint", source.string, source.pos)
+        raise ParseError()
 
     return ch
 
@@ -674,7 +684,7 @@ def parse_cost_equation(source, constraints):
 
     max_inc = parse_fuzzy_compare(source)
     if max_inc is None:
-        raise error("missing fuzzy cost limit", source.string, source.pos)
+        raise ParseError()
 
     max_cost = int(parse_count(source))
 
@@ -709,7 +719,7 @@ def parse_literal_and_element(source, info):
     inline flag or None if it has reached the end of a sequence.
     """
     characters = []
-    case_flags = info.flags & CASE_FLAGS
+    case_flags = make_case_flags(info)
     while True:
         saved_pos = source.pos
         ch = source.get()
@@ -1420,7 +1430,7 @@ def parse_set(source, info):
     if negate:
         item = item.with_flags(positive=not item.positive)
 
-    item = item.with_flags(case_flags=info.flags & CASE_FLAGS)
+    item = item.with_flags(case_flags=make_case_flags(info))
 
     return item
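
One consequence of the reordering in parse_limited_quantifier above is that
the range checks now run only once the closing "}" has confirmed the braces
really form a quantifier; an unterminated brace sequence is left alone instead
of raising. A hedged sketch of the intended difference (the literal fallback
for the unterminated form is an assumption drawn from the parsing code, not
from the package's tests):

    import regex

    # A complete quantifier with min > max is still rejected at compile time.
    try:
        regex.compile(r'a{2,1}')
    except regex.error as e:
        print('rejected: %s' % e)

    # With no closing brace the quantifier parse is abandoned and (assumed)
    # the braces match as ordinary characters.
    print(regex.match(r'a{2,1', 'a{2,1'))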
 
diff --git a/Python2/regex.py b/Python2/regex.py
index 56b1c07..10028cf 100644
--- a/Python2/regex.py
+++ b/Python2/regex.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
   "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W",
   "WORD", "error", "Regex"]
 
-__version__ = "2.4.95"
+__version__ = "2.4.101"
 
 # --------------------------------------------------------------------
 # Public interface.
diff --git a/Python2/test_regex.py b/Python2/test_regex.py
index 871eeef..a9481fe 100644
--- a/Python2/test_regex.py
+++ b/Python2/test_regex.py
@@ -3617,6 +3617,59 @@ thing
           "My SSN is 999-89-76, but don't tell.", partial=True).span(), (36,
           36))
 
+        # Hg issue 204: confusion of (?aif) flags
+        upper_i = u'\N{CYRILLIC CAPITAL LETTER SHORT I}'
+        lower_i = u'\N{CYRILLIC SMALL LETTER SHORT I}'
+
+        self.assertEquals(bool(regex.match(ur'(?ui)' + upper_i,
+          lower_i)), True)
+        self.assertEquals(bool(regex.match(ur'(?ui)' + lower_i,
+          upper_i)), True)
+
+        self.assertEquals(bool(regex.match(ur'(?ai)' + upper_i,
+          lower_i)), False)
+        self.assertEquals(bool(regex.match(ur'(?ai)' + lower_i,
+          upper_i)), False)
+
+        self.assertEquals(bool(regex.match(ur'(?afi)' + upper_i,
+          lower_i)), False)
+        self.assertEquals(bool(regex.match(ur'(?afi)' + lower_i,
+          upper_i)), False)
+
+        # Hg issue 205: Named list and (?ri) flags
+        self.assertEquals(bool(regex.search(r'(?i)\L<aa>', '22', aa=['121',
+          '22'])), True)
+        self.assertEquals(bool(regex.search(r'(?ri)\L<aa>', '22', aa=['121',
+          '22'])), True)
+        self.assertEquals(bool(regex.search(r'(?fi)\L<aa>', '22', aa=['121',
+          '22'])), True)
+        self.assertEquals(bool(regex.search(r'(?fri)\L<aa>', '22', aa=['121',
+          '22'])), True)
+
+        # Hg issue 208: Named list, (?ri) flags, Backreference
+        self.assertEquals(regex.search(r'(?r)\1dog..(?<=(\L<aa>))$', 'ccdogcc',
+          aa=['bcb', 'cc']). span(), (0, 7))
+        self.assertEquals(regex.search(r'(?ir)\1dog..(?<=(\L<aa>))$',
+          'ccdogcc', aa=['bcb', 'cc']). span(), (0, 7))
+
+        # Hg issue 210: Fuzzy matching and Backreference
+        self.assertEquals(regex.search(r'(2)(?:\1{5}){e<=1}',
+          '3222212').span(), (1, 7))
+        self.assertEquals(regex.search(r'(\d)(?:\1{5}){e<=1}',
+          '3222212').span(), (1, 7))
+
+        # Hg issue 211: Segmentation fault with recursive matches and atomic groups
+        self.assertEquals(regex.match(r'''\A(?P<whole>(?>\((?&whole)\)|[+\-]))\Z''',
+          '((-))').span(), (0, 5))
+        self.assertEquals(regex.match(r'''\A(?P<whole>(?>\((?&whole)\)|[+\-]))\Z''',
+          '((-)+)'), None)
+
+        # Hg Issue #212: Unexpected matching difference with .*? between re and regex
+        self.assertEquals(regex.match(r"x.*? (.).*\1(.*)\1",
+          'x  |y| z|').span(), (0, 9))
+        self.assertEquals(regex.match(r"\.sr (.*?) (.)(.*)\2(.*)\2(.*)",
+          r'.sr  h |<nw>|<span class="locked">|').span(), (0, 35))
+
     def test_subscripted_captures(self):
         self.assertEqual(regex.match(r'(?P<x>.)+',
           'abc').expandf('{0} {0[0]} {0[-1]}'), 'abc abc abc')
diff --git a/Python3/_regex.c b/Python3/_regex.c
index e1a81f7..6e346c4 100644
--- a/Python3/_regex.c
+++ b/Python3/_regex.c
@@ -371,6 +371,7 @@ typedef struct RE_AtomicData {
     RE_BacktrackData* backtrack;
     struct RE_SavedGroups* saved_groups;
     struct RE_SavedRepeats* saved_repeats;
+    struct RE_GroupCallFrame* call_frame;
     Py_ssize_t slice_start;
     Py_ssize_t slice_end;
     Py_ssize_t text_pos;
@@ -631,7 +632,7 @@ typedef struct PatternObject {
     PyObject_HEAD
     PyObject* pattern; /* Pattern source (or None). */
     Py_ssize_t flags; /* Flags used when compiling pattern source. */
-    RE_UINT8* packed_code_list;
+    PyObject* packed_code_list;
     PyObject* weakreflist; /* List of weak references */
     /* Nodes into which the regular expression is compiled. */
     RE_Node* start_node;
@@ -2764,10 +2765,30 @@ Py_LOCAL_INLINE(void) clear_groups(RE_State* state) {
     }
 }
 
+/* Resets the various guards. */
+Py_LOCAL_INLINE(void) reset_guards(RE_State* state) {
+    size_t i;
+
+    /* Reset the guards for the repeats. */
+    for (i = 0; i < state->pattern->repeat_count; i++) {
+        reset_guard_list(&state->repeats[i].body_guard_list);
+        reset_guard_list(&state->repeats[i].tail_guard_list);
+    }
+
+    /* Reset the guards for the fuzzy sections. */
+    for (i = 0; i < state->pattern->fuzzy_count; i++) {
+        reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
+        reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
+    }
+
+    /* Reset the guards for the group calls. */
+    for (i = 0; i < state->pattern->call_ref_info_count; i++)
+        reset_guard_list(&state->group_call_guard_list[i]);
+}
+
 /* Initialises the state for a match. */
 Py_LOCAL_INLINE(void) init_match(RE_State* state) {
     RE_AtomicBlock* current;
-    size_t i;
 
     /* Reset the backtrack. */
     state->current_backtrack_block = &state->backtrack_block;
@@ -2787,24 +2808,11 @@ Py_LOCAL_INLINE(void) init_match(RE_State* state) {
         state->current_atomic_block->count = 0;
     }
 
-    /* Reset the guards for the repeats. */
-    for (i = 0; i < state->pattern->repeat_count; i++) {
-        reset_guard_list(&state->repeats[i].body_guard_list);
-        reset_guard_list(&state->repeats[i].tail_guard_list);
-    }
-
-    /* Reset the guards for the fuzzy sections. */
-    for (i = 0; i < state->pattern->fuzzy_count; i++) {
-        reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
-        reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
-    }
-
     /* Clear the groups. */
     clear_groups(state);
 
-    /* Reset the guards for the group calls. */
-    for (i = 0; i < state->pattern->call_ref_info_count; i++)
-        reset_guard_list(&state->group_call_guard_list[i]);
+    /* Reset the guards. */
+    reset_guards(state);
 
     /* Clear the counts and cost for matching. */
     if (state->pattern->is_fuzzy) {
@@ -9132,11 +9140,15 @@ Py_LOCAL_INLINE(BOOL) is_repeat_guarded(RE_SafeState* safe_state, size_t index,
 }
 
 /* Builds a Unicode string. */
-Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len,
-  Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t start,
+  Py_ssize_t end, Py_ssize_t buffer_charsize) {
 #if PY_VERSION_HEX >= 0x03030000
+    Py_ssize_t len;
     int kind;
 
+    buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+    len = end - start;
+
     switch (buffer_charsize) {
     case 1:
         kind = PyUnicode_1BYTE_KIND;
@@ -9154,17 +9166,26 @@ Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len,
 
     return PyUnicode_FromKindAndData(kind, buffer, len);
 #else
+    Py_ssize_t len;
+
+    buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+    len = end - start;
+
     return PyUnicode_FromUnicode(buffer, len);
 #endif
 }
 
 /* Builds a bytestring. Returns NULL if any member is too wide. */
-Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t len,
-  Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t start,
+  Py_ssize_t end, Py_ssize_t buffer_charsize) {
+    Py_ssize_t len;
     Py_UCS1* byte_buffer;
     Py_ssize_t i;
     PyObject* result;
 
+    buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+    len = end - start;
+
     if (buffer_charsize == 1)
         return Py_BuildValue("y#", buffer, len);
 
@@ -9199,11 +9220,10 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
     int status;
 
     if (state->is_unicode)
-        string = build_unicode_value(state->point_to(state->text, first), last
-          - first, state->charsize);
+        string = build_unicode_value(state->text, first, last,
+          state->charsize);
     else
-        string = build_bytes_value(state->point_to(state->text, first), last -
-          first, state->charsize);
+        string = build_bytes_value(state->text, first, last, state->charsize);
     if (!string)
         return RE_ERROR_INTERNAL;
 
@@ -9215,8 +9235,8 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
 
 /* Looks for a string in a string set, ignoring case. */
 Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
-  string_set, void* buffer, Py_ssize_t index, Py_ssize_t len, Py_ssize_t
-  buffer_charsize) {
+  string_set, void* buffer, Py_ssize_t first, Py_ssize_t last, Py_ssize_t
+   index, Py_ssize_t buffer_charsize) {
     Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
     void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
     RE_EncodingTable* encoding;
@@ -9248,11 +9268,11 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
     possible_turkic = encoding->possible_turkic;
 
     /* Look for a possible Turkic 'I'. */
-    while (index < len && !possible_turkic(locale_info, char_at(buffer,
+    while (index < last && !possible_turkic(locale_info, char_at(buffer,
       index)))
         ++index;
 
-    if (index < len) {
+    if (index < last) {
         /* Possible Turkic 'I'. */
         int count;
         int i;
@@ -9267,8 +9287,8 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
             set_char_at(buffer, index, codepoints[i]);
 
             /* Recurse for the remainder of the string. */
-            status = string_set_contains_ign(state, string_set, buffer, index +
-              1, len, buffer_charsize);
+            status = string_set_contains_ign(state, string_set, buffer, first,
+              last, index + 1, buffer_charsize);
             if (status != 0)
                 return status;
         }
@@ -9280,9 +9300,9 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
         int status;
 
         if (state->is_unicode)
-            string = build_unicode_value(buffer, len, buffer_charsize);
+            string = build_unicode_value(buffer, first, last, buffer_charsize);
         else
-            string = build_bytes_value(buffer, len, buffer_charsize);
+            string = build_bytes_value(buffer, first, last, buffer_charsize);
         if (!string)
             return RE_ERROR_MEMORY;
 
@@ -9546,7 +9566,6 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
     Py_ssize_t first;
     Py_ssize_t last;
     PyObject* string_set;
... 613 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-regex.git


