[Python-modules-commits] [python-regex] 01/04: Import python-regex_0.1.20160605.orig.tar.gz
Sandro Tosi
morph at moszumanska.debian.org
Sat Jun 11 18:25:20 UTC 2016
This is an automated email from the git hooks/post-receive script.
morph pushed a commit to branch master
in repository python-regex.
commit e5fde42a75e8bb5ca6da3df100321f7ba9c6e6e2
Author: Sandro Tosi <morph at debian.org>
Date: Sat Jun 11 19:20:45 2016 +0100
Import python-regex_0.1.20160605.orig.tar.gz
---
PKG-INFO | 4 +-
Python2/_regex.c | 229 ++++++++++++++++++++++++++++++------------------
Python2/_regex_core.py | 46 ++++++----
Python2/regex.py | 2 +-
Python2/test_regex.py | 53 +++++++++++
Python3/_regex.c | 233 +++++++++++++++++++++++++++++++------------------
Python3/_regex_core.py | 46 ++++++----
Python3/regex.py | 2 +-
Python3/test_regex.py | 53 +++++++++++
docs/Features.rst | 2 +
setup.py | 2 +-
11 files changed, 464 insertions(+), 208 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index 1afff98..f9efcbe 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: regex
-Version: 2016.04.25
+Version: 2016.06.05
Summary: Alternative regular expression module, to replace re.
Home-page: https://bitbucket.org/mrabarnett/mrab-regex
Author: Matthew Barnett
@@ -148,6 +148,8 @@ Description: Introduction
The issue numbers relate to the Python bug tracker, except where listed as "Hg issue".
+ * Fixed support for pickling compiled regexes (Hg issue 195)
+
* Added support for lookaround in conditional pattern (Hg issue 163)
The test of a conditional pattern can now be a lookaround.
diff --git a/Python2/_regex.c b/Python2/_regex.c
index e3effcc..eda58b7 100644
--- a/Python2/_regex.c
+++ b/Python2/_regex.c
@@ -380,6 +380,7 @@ typedef struct RE_AtomicData {
RE_BacktrackData* backtrack;
struct RE_SavedGroups* saved_groups;
struct RE_SavedRepeats* saved_repeats;
+ struct RE_GroupCallFrame* call_frame;
Py_ssize_t slice_start;
Py_ssize_t slice_end;
Py_ssize_t text_pos;
@@ -644,7 +645,7 @@ typedef struct PatternObject {
PyObject_HEAD
PyObject* pattern; /* Pattern source (or None). */
Py_ssize_t flags; /* Flags used when compiling pattern source. */
- RE_UINT8* packed_code_list;
+ PyObject* packed_code_list;
PyObject* weakreflist; /* List of weak references */
/* Nodes into which the regular expression is compiled. */
RE_Node* start_node;
@@ -2770,10 +2771,30 @@ Py_LOCAL_INLINE(void) clear_groups(RE_State* state) {
}
}
+/* Resets the various guards. */
+Py_LOCAL_INLINE(void) reset_guards(RE_State* state) {
+ size_t i;
+
+ /* Reset the guards for the repeats. */
+ for (i = 0; i < state->pattern->repeat_count; i++) {
+ reset_guard_list(&state->repeats[i].body_guard_list);
+ reset_guard_list(&state->repeats[i].tail_guard_list);
+ }
+
+ /* Reset the guards for the fuzzy sections. */
+ for (i = 0; i < state->pattern->fuzzy_count; i++) {
+ reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
+ reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
+ }
+
+ /* Reset the guards for the group calls. */
+ for (i = 0; i < state->pattern->call_ref_info_count; i++)
+ reset_guard_list(&state->group_call_guard_list[i]);
+}
+
/* Initialises the state for a match. */
Py_LOCAL_INLINE(void) init_match(RE_State* state) {
RE_AtomicBlock* current;
- size_t i;
/* Reset the backtrack. */
state->current_backtrack_block = &state->backtrack_block;
@@ -2793,24 +2814,11 @@ Py_LOCAL_INLINE(void) init_match(RE_State* state) {
state->current_atomic_block->count = 0;
}
- /* Reset the guards for the repeats. */
- for (i = 0; i < state->pattern->repeat_count; i++) {
- reset_guard_list(&state->repeats[i].body_guard_list);
- reset_guard_list(&state->repeats[i].tail_guard_list);
- }
-
- /* Reset the guards for the fuzzy sections. */
- for (i = 0; i < state->pattern->fuzzy_count; i++) {
- reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
- reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
- }
-
/* Clear the groups. */
clear_groups(state);
- /* Reset the guards for the group calls. */
- for (i = 0; i < state->pattern->call_ref_info_count; i++)
- reset_guard_list(&state->group_call_guard_list[i]);
+ /* Reset the guards. */
+ reset_guards(state);
/* Clear the counts and cost for matching. */
if (state->pattern->is_fuzzy) {
@@ -9138,18 +9146,27 @@ Py_LOCAL_INLINE(BOOL) is_repeat_guarded(RE_SafeState* safe_state, size_t index,
}
/* Builds a Unicode string. */
-Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len,
- Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t start,
+ Py_ssize_t end, Py_ssize_t buffer_charsize) {
+ Py_ssize_t len;
+
+ buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+ len = end - start;
+
return PyUnicode_FromUnicode(buffer, len);
}
/* Builds a bytestring. Returns NULL if any member is too wide. */
-Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t len,
- Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t start,
+ Py_ssize_t end, Py_ssize_t buffer_charsize) {
+ Py_ssize_t len;
Py_UCS1* byte_buffer;
Py_ssize_t i;
PyObject* result;
+ buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+ len = end - start;
+
if (buffer_charsize == 1)
return Py_BuildValue("s#", buffer, len);
@@ -9184,11 +9201,10 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
int status;
if (state->is_unicode)
- string = build_unicode_value(state->point_to(state->text, first), last
- - first, state->charsize);
+ string = build_unicode_value(state->text, first, last,
+ state->charsize);
else
- string = build_bytes_value(state->point_to(state->text, first), last -
- first, state->charsize);
+ string = build_bytes_value(state->text, first, last, state->charsize);
if (!string)
return RE_ERROR_INTERNAL;
@@ -9200,8 +9216,8 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
/* Looks for a string in a string set, ignoring case. */
Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
- string_set, void* buffer, Py_ssize_t index, Py_ssize_t len, Py_ssize_t
- buffer_charsize) {
+ string_set, void* buffer, Py_ssize_t first, Py_ssize_t last, Py_ssize_t
+ index, Py_ssize_t buffer_charsize) {
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
RE_EncodingTable* encoding;
@@ -9233,11 +9249,11 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
possible_turkic = encoding->possible_turkic;
/* Look for a possible Turkic 'I'. */
- while (index < len && !possible_turkic(locale_info, char_at(buffer,
+ while (index < last && !possible_turkic(locale_info, char_at(buffer,
index)))
++index;
- if (index < len) {
+ if (index < last) {
/* Possible Turkic 'I'. */
int count;
int i;
@@ -9252,8 +9268,8 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
set_char_at(buffer, index, codepoints[i]);
/* Recurse for the remainder of the string. */
- status = string_set_contains_ign(state, string_set, buffer, index +
- 1, len, buffer_charsize);
+ status = string_set_contains_ign(state, string_set, buffer, first,
+ last, index + 1, buffer_charsize);
if (status != 0)
return status;
}
@@ -9265,9 +9281,9 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
int status;
if (state->is_unicode)
- string = build_unicode_value(buffer, len, buffer_charsize);
+ string = build_unicode_value(buffer, first, last, buffer_charsize);
else
- string = build_bytes_value(buffer, len, buffer_charsize);
+ string = build_bytes_value(buffer, first, last, buffer_charsize);
if (!string)
return RE_ERROR_MEMORY;
@@ -9531,7 +9547,6 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
Py_ssize_t first;
Py_ssize_t last;
PyObject* string_set;
- void* folded_buffer;
state = safe_state->re_state;
full_case_fold = state->encoding->full_case_fold;
@@ -9651,7 +9666,7 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
/* Is the text we have a partial match? */
status = string_set_contains_ign(state, string_set, folded, first,
- last, folded_charsize);
+ last, first, folded_charsize);
if (status < 0)
goto finished;
@@ -9675,18 +9690,13 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
goto finished;
}
- /* Point to the used portion of the folded buffer. */
- folded_buffer = (void*)((Py_UCS1*)folded + first * folded_charsize);
- last -= first;
- first = 0;
-
/* We've already looked for a partial match (if allowed), but what about a
* complete match?
*/
while (len >= min_len) {
if (end_of_fold[len]) {
- status = string_set_contains_ign(state, string_set, folded_buffer,
- first, last, folded_charsize);
+ status = string_set_contains_ign(state, string_set, folded, first,
+ last, first, folded_charsize);
if (status == 1) {
/* Advance past the match. */
@@ -9810,7 +9820,7 @@ Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state,
}
if (reverse) {
- first = f_pos;
+ first = f_pos + 1;
last = max_len;
} else {
first = 0;
@@ -9839,7 +9849,7 @@ Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state,
/* Is the text we have a partial match? */
status = string_set_contains_ign(state, string_set, folded, first,
- last, folded_charsize);
+ last, first, folded_charsize);
if (status < 0)
goto finished;
@@ -9868,7 +9878,7 @@ Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state,
*/
while (len >= min_len) {
status = string_set_contains_ign(state, string_set, folded, first,
- last, folded_charsize);
+ last, first, folded_charsize);
if (status == 1) {
/* Advance past the match. */
@@ -11683,6 +11693,7 @@ advance:
atomic->is_lookaround = FALSE;
atomic->has_groups = (node->status & RE_STATUS_HAS_GROUPS) != 0;
atomic->has_repeats = (node->status & RE_STATUS_HAS_REPEATS) != 0;
+ atomic->call_frame = state->current_group_call_frame;
/* Save the groups and repeats. */
if (atomic->has_groups && !push_groups(safe_state))
@@ -14559,7 +14570,7 @@ backtrack:
case RE_OP_ATOMIC: /* Start of an atomic group. */
{
RE_AtomicData* atomic;
- /* backtrack to the start of an atomic group. */
+ /* Backtrack to the start of an atomic group. */
atomic = pop_atomic(safe_state);
if (atomic->has_repeats)
@@ -14570,6 +14581,7 @@ backtrack:
state->too_few_errors = bt_data->atomic.too_few_errors;
state->capture_change = bt_data->atomic.capture_change;
+ state->current_group_call_frame = atomic->call_frame;
discard_backtrack(state);
break;
@@ -14825,6 +14837,9 @@ backtrack:
/* Clear the groups. */
clear_groups(state);
+ /* Reset the guards. */
+ reset_guards(state);
+
goto start_match;
}
case RE_OP_FUZZY: /* Fuzzy matching. */
@@ -21276,7 +21291,7 @@ static void pattern_dealloc(PyObject* self_) {
Py_DECREF(self->named_list_indexes);
Py_DECREF(self->required_chars);
re_dealloc(self->locale_info);
- re_dealloc(self->packed_code_list);
+ Py_DECREF(self->packed_code_list);
PyObject_DEL(self);
}
@@ -21346,13 +21361,13 @@ Py_LOCAL_INLINE(BOOL) append_integer(PyObject* list, Py_ssize_t value) {
}
/* Packs the code list that's needed for pickling. */
-Py_LOCAL_INLINE(RE_UINT8*) pack_code_list(RE_CODE* code, Py_ssize_t code_len) {
+Py_LOCAL_INLINE(PyObject*) pack_code_list(RE_CODE* code, Py_ssize_t code_len) {
Py_ssize_t max_size;
RE_UINT8* packed;
Py_ssize_t count;
RE_UINT32 value;
Py_ssize_t i;
- RE_UINT8* new_packed;
+ PyObject* packed_code_list;
/* What is the maximum number of bytes needed to store it?
*
@@ -21385,17 +21400,16 @@ Py_LOCAL_INLINE(RE_UINT8*) pack_code_list(RE_CODE* code, Py_ssize_t code_len) {
packed[count++] = value;
}
- /* Discard the unused bytes. */
- new_packed = re_realloc(packed, count);
- if (new_packed)
- packed = new_packed;
+ packed_code_list = PyString_FromStringAndSize((const char *)packed, count);
+ re_dealloc(packed);
- return packed;
+ return packed_code_list;
}
/* Unpacks the code list that's needed for pickling. */
-Py_LOCAL_INLINE(PyObject*) unpack_code_list(RE_UINT8* packed) {
+Py_LOCAL_INLINE(PyObject*) unpack_code_list(PyObject* packed) {
PyObject* code_list;
+ RE_UINT8* packed_data;
Py_ssize_t index;
RE_UINT32 value;
int shift;
@@ -21405,18 +21419,19 @@ Py_LOCAL_INLINE(PyObject*) unpack_code_list(RE_UINT8* packed) {
if (!code_list)
return NULL;
+ packed_data = (RE_UINT8*)PyString_AsString(packed);
index = 0;
/* Unpack the length of the code list. */
value = 0;
shift = 0;
- while (packed[index] >= 0x80) {
- value |= (RE_UINT32)(packed[index++] & 0x7F) << shift;
+ while (packed_data[index] >= 0x80) {
+ value |= (RE_UINT32)(packed_data[index++] & 0x7F) << shift;
shift += 7;
}
- value |= (RE_UINT32)packed[index++] << shift;
+ value |= (RE_UINT32)packed_data[index++] << shift;
count = (size_t)value;
/* Unpack each of the elements of the code list. */
@@ -21427,12 +21442,12 @@ Py_LOCAL_INLINE(PyObject*) unpack_code_list(RE_UINT8* packed) {
value = 0;
shift = 0;
- while (packed[index] >= 0x80) {
- value |= (RE_UINT32)(packed[index++] & 0x7F) << shift;
+ while (packed_data[index] >= 0x80) {
+ value |= (RE_UINT32)(packed_data[index++] & 0x7F) << shift;
shift += 7;
}
- value |= (RE_UINT32)packed[index++] << shift;
+ value |= (RE_UINT32)packed_data[index++] << shift;
#if PY_VERSION_HEX >= 0x02060000
obj = PyLong_FromSize_t((size_t)value);
#else
@@ -21658,20 +21673,15 @@ static PyObject* pattern_groupindex(PyObject* self_) {
/* PatternObject's '_pickled_data' method. */
static PyObject* pattern_pickled_data(PyObject* self_) {
PatternObject* self;
- PyObject* code_list;
PyObject* pickled_data;
self = (PatternObject*)self_;
- code_list = unpack_code_list(self->packed_code_list);
- if (!code_list)
- return NULL;
-
/* Build the data needed for picking. */
pickled_data = Py_BuildValue("OnOOOOOnOnn", self->pattern, self->flags,
- code_list, self->groupindex, self->indexgroup, self->named_lists,
- self->named_list_indexes, self->req_offset, self->required_chars,
- self->req_flags, self->public_group_count);
+ self->packed_code_list, self->groupindex, self->indexgroup,
+ self->named_lists, self->named_list_indexes, self->req_offset,
+ self->required_chars, self->req_flags, self->public_group_count);
return pickled_data;
}
@@ -22005,10 +22015,25 @@ Py_LOCAL_INLINE(RE_STATUS_T) add_repeat_guards(PatternObject* pattern, RE_Node*
node->status |= RE_STATUS_VISITED_AG | result;
break;
default:
- node->status |= RE_STATUS_VISITED_AG;
- CheckStack_push(&stack, node->next_1.node, result);
+ {
+ RE_Node* tail;
+ BOOL visited_tail;
+ RE_STATUS_T tail_result;
+
+ tail = node->next_1.node;
+ visited_tail = (tail->status & RE_STATUS_VISITED_AG);
+
+ if (visited_tail) {
+ tail_result = tail->status & (RE_STATUS_REPEAT |
+ RE_STATUS_REF);
+ node->status |= RE_STATUS_VISITED_AG | tail_result;
+ } else {
+ CheckStack_push(&stack, node, result);
+ CheckStack_push(&stack, node->next_1.node, result);
+ }
break;
}
+ }
}
}
@@ -24237,12 +24262,13 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
PyObject* required_chars;
Py_ssize_t req_flags;
size_t public_group_count;
+ BOOL unpacked;
Py_ssize_t code_len;
RE_CODE* code;
Py_ssize_t i;
RE_CODE* req_chars;
size_t req_length;
- RE_UINT8* packed_code_list;
+ PyObject* packed_code_list;
PatternObject* self;
BOOL unicode;
BOOL locale;
@@ -24254,11 +24280,29 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
&req_offset, &required_chars, &req_flags, &public_group_count))
return NULL;
+ /* If it came from a pickled source, code_list will be a packed code list
+ * in a bytestring.
+ */
+ if (PyString_Check(code_list)) {
+ packed_code_list = code_list;
+ code_list = unpack_code_list(packed_code_list);
+ if (!code_list)
+ return NULL;
+
+ unpacked = TRUE;
+ } else
+ unpacked = FALSE;
+
/* Read the regex code. */
code_len = PyList_GET_SIZE(code_list);
code = (RE_CODE*)re_alloc((size_t)code_len * sizeof(RE_CODE));
- if (!code)
+ if (!code) {
+ if (unpacked)
+ /* code_list has been built from a packed code list. */
+ Py_DECREF(code_list);
+
return NULL;
+ }
for (i = 0; i < code_len; i++) {
PyObject* o;
@@ -24279,20 +24323,25 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
/* Get the required characters. */
get_required_chars(required_chars, &req_chars, &req_length);
- /* Pack the code list in case it's needed for pickling. */
- packed_code_list = pack_code_list(code, code_len);
- if (!packed_code_list) {
- set_error(RE_ERROR_MEMORY, NULL);
- re_dealloc(req_chars);
- re_dealloc(code);
- return NULL;
+ if (!unpacked) {
+ /* Pack the code list in case it's needed for pickling. */
+ packed_code_list = pack_code_list(code, code_len);
+ if (!packed_code_list) {
+ set_error(RE_ERROR_MEMORY, NULL);
+ re_dealloc(req_chars);
+ re_dealloc(code);
+ return NULL;
+ }
}
/* Create the PatternObject. */
self = PyObject_NEW(PatternObject, &Pattern_Type);
if (!self) {
set_error(RE_ERROR_MEMORY, NULL);
- re_dealloc(packed_code_list);
+ if (unpacked)
+ Py_DECREF(code_list);
+ else
+ Py_DECREF(packed_code_list);
re_dealloc(req_chars);
re_dealloc(code);
return NULL;
@@ -24335,6 +24384,8 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
self->req_string = NULL;
self->locale_info = NULL;
Py_INCREF(self->pattern);
+ if (unpacked)
+ Py_INCREF(self->packed_code_list);
Py_INCREF(self->groupindex);
Py_INCREF(self->indexgroup);
Py_INCREF(self->named_lists);
@@ -24367,6 +24418,8 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
if (!ok) {
Py_DECREF(self);
re_dealloc(req_chars);
+ if (unpacked)
+ Py_DECREF(code_list);
return NULL;
}
@@ -24421,17 +24474,25 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
self->locale_info = re_alloc(sizeof(RE_LocaleInfo));
if (!self->locale_info) {
Py_DECREF(self);
+ if (unpacked)
+ Py_DECREF(code_list);
return NULL;
}
scan_locale_chars(self->locale_info);
}
+ if (unpacked)
+ Py_DECREF(code_list);
+
return (PyObject*)self;
error:
re_dealloc(code);
set_error(RE_ERROR_ILLEGAL, NULL);
+ if (unpacked)
+ Py_DECREF(code_list);
+
return NULL;
}
@@ -24587,9 +24648,9 @@ static PyObject* fold_case(PyObject* self_, PyObject* args) {
/* Build the result string. */
if (str_info.is_unicode)
- result = build_unicode_value(folded, folded_len, folded_charsize);
+ result = build_unicode_value(folded, 0, folded_len, folded_charsize);
else
- result = build_bytes_value(folded, folded_len, folded_charsize);
+ result = build_bytes_value(folded, 0, folded_len, folded_charsize);
re_dealloc(folded);
@@ -24622,7 +24683,7 @@ static PyObject* get_expand_on_folding(PyObject* self, PyObject* unused) {
codepoint = re_expand_on_folding[i];
- item = build_unicode_value(&codepoint, 1, sizeof(codepoint));
+ item = build_unicode_value(&codepoint, 0, 1, sizeof(codepoint));
if (!item)
goto error;
diff --git a/Python2/_regex_core.py b/Python2/_regex_core.py
index 311f8e3..5599ed4 100644
--- a/Python2/_regex_core.py
+++ b/Python2/_regex_core.py
@@ -339,28 +339,38 @@ def _flatten_code(code):
return flat_code
+def make_case_flags(info):
+ "Makes the case flags."
+ flags = info.flags & CASE_FLAGS
+
+ # Turn off FULLCASE if ASCII is turned on.
+ if info.flags & ASCII:
+ flags &= ~FULLCASE
+
+ return flags
+
def make_character(info, value, in_set=False):
"Makes a character literal."
if in_set:
# A character set is built case-sensitively.
return Character(value)
- return Character(value, case_flags=info.flags & CASE_FLAGS)
+ return Character(value, case_flags=make_case_flags(info))
def make_ref_group(info, name, position):
"Makes a group reference."
- return RefGroup(info, name, position, case_flags=info.flags & CASE_FLAGS)
+ return RefGroup(info, name, position, case_flags=make_case_flags(info))
def make_string_set(info, name):
"Makes a string set."
- return StringSet(info, name, case_flags=info.flags & CASE_FLAGS)
+ return StringSet(info, name, case_flags=make_case_flags(info))
def make_property(info, prop, in_set):
"Makes a property."
if in_set:
return prop
- return prop.with_flags(case_flags=info.flags & CASE_FLAGS)
+ return prop.with_flags(case_flags=make_case_flags(info))
def _parse_pattern(source, info):
"Parses a pattern, eg. 'a|b|c'."
@@ -518,10 +528,6 @@ def parse_limited_quantifier(source):
# No minimum means 0 and no maximum means unlimited.
min_count = int(min_count or 0)
max_count = int(max_count) if max_count else None
-
- if max_count is not None and min_count > max_count:
- raise error("min repeat greater than max repeat", source.string,
- saved_pos)
else:
if not min_count:
source.pos = saved_pos
@@ -529,22 +535,26 @@ def parse_limited_quantifier(source):
min_count = max_count = int(min_count)
- if is_above_limit(min_count) or is_above_limit(max_count):
- raise error("repeat count too big", source.string, saved_pos)
-
if not source.match ("}"):
source.pos = saved_pos
return None
+ if is_above_limit(min_count) or is_above_limit(max_count):
+ raise error("repeat count too big", source.string, saved_pos)
+
+ if max_count is not None and min_count > max_count:
+ raise error("min repeat greater than max repeat", source.string,
+ saved_pos)
+
return min_count, max_count
def parse_fuzzy(source, ch):
"Parses a fuzzy setting, if present."
+ saved_pos = source.pos
+
if ch != "{":
return None
- saved_pos = source.pos
-
constraints = {}
try:
parse_fuzzy_item(source, constraints)
@@ -645,10 +655,10 @@ def parse_cost_limit(source):
def parse_constraint(source, constraints, ch):
"Parses a constraint."
if ch not in "deis":
- raise error("bad fuzzy constraint", source.string, source.pos)
+ raise ParseError()
if ch in constraints:
- raise error("repeated fuzzy constraint", source.string, source.pos)
+ raise ParseError()
return ch
@@ -674,7 +684,7 @@ def parse_cost_equation(source, constraints):
max_inc = parse_fuzzy_compare(source)
if max_inc is None:
- raise error("missing fuzzy cost limit", source.string, source.pos)
+ raise ParseError()
max_cost = int(parse_count(source))
@@ -709,7 +719,7 @@ def parse_literal_and_element(source, info):
inline flag or None if it has reached the end of a sequence.
"""
characters = []
- case_flags = info.flags & CASE_FLAGS
+ case_flags = make_case_flags(info)
while True:
saved_pos = source.pos
ch = source.get()
@@ -1420,7 +1430,7 @@ def parse_set(source, info):
if negate:
item = item.with_flags(positive=not item.positive)
- item = item.with_flags(case_flags=info.flags & CASE_FLAGS)
+ item = item.with_flags(case_flags=make_case_flags(info))
return item
diff --git a/Python2/regex.py b/Python2/regex.py
index 56b1c07..10028cf 100644
--- a/Python2/regex.py
+++ b/Python2/regex.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
"U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W",
"WORD", "error", "Regex"]
-__version__ = "2.4.95"
+__version__ = "2.4.101"
# --------------------------------------------------------------------
# Public interface.
diff --git a/Python2/test_regex.py b/Python2/test_regex.py
index 871eeef..a9481fe 100644
--- a/Python2/test_regex.py
+++ b/Python2/test_regex.py
@@ -3617,6 +3617,59 @@ thing
"My SSN is 999-89-76, but don't tell.", partial=True).span(), (36,
36))
+ # Hg issue 204: confusion of (?aif) flags
+ upper_i = u'\N{CYRILLIC CAPITAL LETTER SHORT I}'
+ lower_i = u'\N{CYRILLIC SMALL LETTER SHORT I}'
+
+ self.assertEquals(bool(regex.match(ur'(?ui)' + upper_i,
+ lower_i)), True)
+ self.assertEquals(bool(regex.match(ur'(?ui)' + lower_i,
+ upper_i)), True)
+
+ self.assertEquals(bool(regex.match(ur'(?ai)' + upper_i,
+ lower_i)), False)
+ self.assertEquals(bool(regex.match(ur'(?ai)' + lower_i,
+ upper_i)), False)
+
+ self.assertEquals(bool(regex.match(ur'(?afi)' + upper_i,
+ lower_i)), False)
+ self.assertEquals(bool(regex.match(ur'(?afi)' + lower_i,
+ upper_i)), False)
+
+ # Hg issue 205: Named list and (?ri) flags
+ self.assertEquals(bool(regex.search(r'(?i)\L<aa>', '22', aa=['121',
+ '22'])), True)
+ self.assertEquals(bool(regex.search(r'(?ri)\L<aa>', '22', aa=['121',
+ '22'])), True)
+ self.assertEquals(bool(regex.search(r'(?fi)\L<aa>', '22', aa=['121',
+ '22'])), True)
+ self.assertEquals(bool(regex.search(r'(?fri)\L<aa>', '22', aa=['121',
+ '22'])), True)
+
+ # Hg issue 208: Named list, (?ri) flags, Backreference
+ self.assertEquals(regex.search(r'(?r)\1dog..(?<=(\L<aa>))$', 'ccdogcc',
+ aa=['bcb', 'cc']). span(), (0, 7))
+ self.assertEquals(regex.search(r'(?ir)\1dog..(?<=(\L<aa>))$',
+ 'ccdogcc', aa=['bcb', 'cc']). span(), (0, 7))
+
+ # Hg issue 210: Fuzzy matching and Backreference
+ self.assertEquals(regex.search(r'(2)(?:\1{5}){e<=1}',
+ '3222212').span(), (1, 7))
+ self.assertEquals(regex.search(r'(\d)(?:\1{5}){e<=1}',
+ '3222212').span(), (1, 7))
+
+ # Hg issue 211: Segmentation fault with recursive matches and atomic groups
+ self.assertEquals(regex.match(r'''\A(?P<whole>(?>\((?&whole)\)|[+\-]))\Z''',
+ '((-))').span(), (0, 5))
+ self.assertEquals(regex.match(r'''\A(?P<whole>(?>\((?&whole)\)|[+\-]))\Z''',
+ '((-)+)'), None)
+
+ # Hg Issue #212: Unexpected matching difference with .*? between re and regex
+ self.assertEquals(regex.match(r"x.*? (.).*\1(.*)\1",
+ 'x |y| z|').span(), (0, 9))
+ self.assertEquals(regex.match(r"\.sr (.*?) (.)(.*)\2(.*)\2(.*)",
+ r'.sr h |<nw>|<span class="locked">|').span(), (0, 35))
+
def test_subscripted_captures(self):
self.assertEqual(regex.match(r'(?P<x>.)+',
'abc').expandf('{0} {0[0]} {0[-1]}'), 'abc abc abc')
diff --git a/Python3/_regex.c b/Python3/_regex.c
index e1a81f7..6e346c4 100644
--- a/Python3/_regex.c
+++ b/Python3/_regex.c
@@ -371,6 +371,7 @@ typedef struct RE_AtomicData {
RE_BacktrackData* backtrack;
struct RE_SavedGroups* saved_groups;
struct RE_SavedRepeats* saved_repeats;
+ struct RE_GroupCallFrame* call_frame;
Py_ssize_t slice_start;
Py_ssize_t slice_end;
Py_ssize_t text_pos;
@@ -631,7 +632,7 @@ typedef struct PatternObject {
PyObject_HEAD
PyObject* pattern; /* Pattern source (or None). */
Py_ssize_t flags; /* Flags used when compiling pattern source. */
- RE_UINT8* packed_code_list;
+ PyObject* packed_code_list;
PyObject* weakreflist; /* List of weak references */
/* Nodes into which the regular expression is compiled. */
RE_Node* start_node;
@@ -2764,10 +2765,30 @@ Py_LOCAL_INLINE(void) clear_groups(RE_State* state) {
}
}
+/* Resets the various guards. */
+Py_LOCAL_INLINE(void) reset_guards(RE_State* state) {
+ size_t i;
+
+ /* Reset the guards for the repeats. */
+ for (i = 0; i < state->pattern->repeat_count; i++) {
+ reset_guard_list(&state->repeats[i].body_guard_list);
+ reset_guard_list(&state->repeats[i].tail_guard_list);
+ }
+
+ /* Reset the guards for the fuzzy sections. */
+ for (i = 0; i < state->pattern->fuzzy_count; i++) {
+ reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
+ reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
+ }
+
+ /* Reset the guards for the group calls. */
+ for (i = 0; i < state->pattern->call_ref_info_count; i++)
+ reset_guard_list(&state->group_call_guard_list[i]);
+}
+
/* Initialises the state for a match. */
Py_LOCAL_INLINE(void) init_match(RE_State* state) {
RE_AtomicBlock* current;
- size_t i;
/* Reset the backtrack. */
state->current_backtrack_block = &state->backtrack_block;
@@ -2787,24 +2808,11 @@ Py_LOCAL_INLINE(void) init_match(RE_State* state) {
state->current_atomic_block->count = 0;
}
- /* Reset the guards for the repeats. */
- for (i = 0; i < state->pattern->repeat_count; i++) {
- reset_guard_list(&state->repeats[i].body_guard_list);
- reset_guard_list(&state->repeats[i].tail_guard_list);
- }
-
- /* Reset the guards for the fuzzy sections. */
- for (i = 0; i < state->pattern->fuzzy_count; i++) {
- reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
- reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
- }
-
/* Clear the groups. */
clear_groups(state);
- /* Reset the guards for the group calls. */
- for (i = 0; i < state->pattern->call_ref_info_count; i++)
- reset_guard_list(&state->group_call_guard_list[i]);
+ /* Reset the guards. */
+ reset_guards(state);
/* Clear the counts and cost for matching. */
if (state->pattern->is_fuzzy) {
@@ -9132,11 +9140,15 @@ Py_LOCAL_INLINE(BOOL) is_repeat_guarded(RE_SafeState* safe_state, size_t index,
}
/* Builds a Unicode string. */
-Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len,
- Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t start,
+ Py_ssize_t end, Py_ssize_t buffer_charsize) {
#if PY_VERSION_HEX >= 0x03030000
+ Py_ssize_t len;
int kind;
+ buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+ len = end - start;
+
switch (buffer_charsize) {
case 1:
kind = PyUnicode_1BYTE_KIND;
@@ -9154,17 +9166,26 @@ Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len,
return PyUnicode_FromKindAndData(kind, buffer, len);
#else
+ Py_ssize_t len;
+
+ buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+ len = end - start;
+
return PyUnicode_FromUnicode(buffer, len);
#endif
}
/* Builds a bytestring. Returns NULL if any member is too wide. */
-Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t len,
- Py_ssize_t buffer_charsize) {
+Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t start,
+ Py_ssize_t end, Py_ssize_t buffer_charsize) {
+ Py_ssize_t len;
Py_UCS1* byte_buffer;
Py_ssize_t i;
PyObject* result;
+ buffer = (void*)((RE_UINT8*)buffer + start * buffer_charsize);
+ len = end - start;
+
if (buffer_charsize == 1)
return Py_BuildValue("y#", buffer, len);
@@ -9199,11 +9220,10 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
int status;
if (state->is_unicode)
- string = build_unicode_value(state->point_to(state->text, first), last
- - first, state->charsize);
+ string = build_unicode_value(state->text, first, last,
+ state->charsize);
else
- string = build_bytes_value(state->point_to(state->text, first), last -
- first, state->charsize);
+ string = build_bytes_value(state->text, first, last, state->charsize);
if (!string)
return RE_ERROR_INTERNAL;
@@ -9215,8 +9235,8 @@ Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
/* Looks for a string in a string set, ignoring case. */
Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
- string_set, void* buffer, Py_ssize_t index, Py_ssize_t len, Py_ssize_t
- buffer_charsize) {
+ string_set, void* buffer, Py_ssize_t first, Py_ssize_t last, Py_ssize_t
+ index, Py_ssize_t buffer_charsize) {
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
RE_EncodingTable* encoding;
@@ -9248,11 +9268,11 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
possible_turkic = encoding->possible_turkic;
/* Look for a possible Turkic 'I'. */
- while (index < len && !possible_turkic(locale_info, char_at(buffer,
+ while (index < last && !possible_turkic(locale_info, char_at(buffer,
index)))
++index;
- if (index < len) {
+ if (index < last) {
/* Possible Turkic 'I'. */
int count;
int i;
@@ -9267,8 +9287,8 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
set_char_at(buffer, index, codepoints[i]);
/* Recurse for the remainder of the string. */
- status = string_set_contains_ign(state, string_set, buffer, index +
- 1, len, buffer_charsize);
+ status = string_set_contains_ign(state, string_set, buffer, first,
+ last, index + 1, buffer_charsize);
if (status != 0)
return status;
}
@@ -9280,9 +9300,9 @@ Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
int status;
if (state->is_unicode)
- string = build_unicode_value(buffer, len, buffer_charsize);
+ string = build_unicode_value(buffer, first, last, buffer_charsize);
else
- string = build_bytes_value(buffer, len, buffer_charsize);
+ string = build_bytes_value(buffer, first, last, buffer_charsize);
if (!string)
return RE_ERROR_MEMORY;
@@ -9546,7 +9566,6 @@ Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
Py_ssize_t first;
Py_ssize_t last;
PyObject* string_set;
... 613 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-regex.git
More information about the Python-modules-commits
mailing list