[Python-modules-commits] [python-regex] 01/04: Import python-regex_0.1.20170117.orig.tar.gz
Sandro Tosi
morph at moszumanska.debian.org
Tue Jan 24 00:39:01 UTC 2017
This is an automated email from the git hooks/post-receive script.
morph pushed a commit to branch master
in repository python-regex.
commit cafa5a91ebc48cf82e557af185ed97774d4732fd
Author: Sandro Tosi <morph at debian.org>
Date: Mon Jan 23 19:35:20 2017 -0500
Import python-regex_0.1.20170117.orig.tar.gz
---
PKG-INFO | 2 +-
Python2/_regex.c | 101 +++++++++++++++++++++++++++--------
Python2/_regex_core.py | 139 ++++++++++++++++++++++++++++++++++++-------------
Python2/regex.py | 2 +-
Python2/test_regex.py | 6 +++
Python3/_regex.c | 101 +++++++++++++++++++++++++++--------
Python3/_regex_core.py | 139 ++++++++++++++++++++++++++++++++++++-------------
Python3/regex.py | 2 +-
Python3/test_regex.py | 6 +++
setup.py | 2 +-
10 files changed, 380 insertions(+), 120 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index 774bcd5..db1dac0 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: regex
-Version: 2016.12.27
+Version: 2017.01.17
Summary: Alternative regular expression module, to replace re.
Home-page: https://bitbucket.org/mrabarnett/mrab-regex
Author: Matthew Barnett
diff --git a/Python2/_regex.c b/Python2/_regex.c
index bd77820..1fed68d 100644
--- a/Python2/_regex.c
+++ b/Python2/_regex.c
@@ -653,6 +653,7 @@ typedef struct PatternObject {
RE_Node* start_test;
size_t true_group_count; /* The true number of capture groups. */
size_t public_group_count; /* The number of public capture groups. */
+ size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...). */
size_t repeat_count; /* The number of repeats. */
Py_ssize_t group_end_index; /* The number of group closures. */
PyObject* groupindex;
@@ -750,6 +751,7 @@ typedef struct RE_CompileArgs {
RE_Node* start; /* The start node. */
RE_Node* end; /* The end node. */
size_t repeat_depth; /* The nesting depth of the repeat. */
+ size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...). */
BOOL forward; /* Whether it's a forward (not reverse) pattern. */
BOOL visible_captures; /* Whether all of the captures will be visible. */
BOOL has_captures; /* Whether the pattern has capture groups. */
@@ -757,6 +759,7 @@ typedef struct RE_CompileArgs {
BOOL within_fuzzy; /* Whether the subpattern is within a fuzzy section. */
BOOL has_groups; /* Whether the subpattern contains captures. */
BOOL has_repeats; /* Whether the subpattern contains repeats. */
+ BOOL in_define; /* Whether we're in (?(DEFINE)...). */
} RE_CompileArgs;
/* The string slices which will be concatenated to make the result string of
@@ -2421,6 +2424,31 @@ Py_LOCAL_INLINE(BOOL) same_char_ign(RE_EncodingTable* encoding, RE_LocaleInfo*
return FALSE;
}
+/* Checks whether 2 characters are the same, ignoring case. The first character
+ * is already case-folded or is a possible Turkic 'I'.
+ */
+Py_LOCAL_INLINE(BOOL) same_char_ign_turkic(RE_EncodingTable* encoding,
+ RE_LocaleInfo* locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
+ int count;
+ Py_UCS4 cases[RE_MAX_CASES];
+ int i;
+
+ if (ch1 == ch2)
+ return TRUE;
+
+ if (!encoding->possible_turkic(locale_info, ch1))
+ return FALSE;
+
+ count = encoding->all_turkic_i(locale_info, ch1, cases);
+
+ for (i = 1; i < count; i++) {
+ if (cases[i] == ch2)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
/* Wrapper for calling 'same_char' via a pointer. */
static BOOL same_char_ign_wrapper(RE_EncodingTable* encoding, RE_LocaleInfo*
locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
@@ -6221,7 +6249,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
Py_ssize_t length;
Py_ssize_t s_pos;
Py_UCS4 folded[RE_MAX_FOLDED];
-
state = safe_state->re_state;
encoding = state->encoding;
locale_info = state->locale_info;
@@ -6256,7 +6283,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
f_pos = 0;
}
- if (s_pos < length && same_char_ign(encoding, locale_info,
+ if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
values[s_pos], folded[f_pos])) {
++s_pos;
++f_pos;
@@ -6297,7 +6324,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
Py_ssize_t length;
Py_ssize_t s_pos;
Py_UCS4 folded[RE_MAX_FOLDED];
-
state = safe_state->re_state;
encoding = state->encoding;
locale_info = state->locale_info;
@@ -6331,7 +6357,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
f_pos = 0;
}
- if (s_pos < length && same_char_ign(encoding, locale_info,
+ if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
values[length - s_pos - 1], folded[folded_len - f_pos - 1])) {
++s_pos;
++f_pos;
@@ -7032,8 +7058,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD(RE_State* state, RE_NextNode* next,
f_pos = 0;
}
- if (!same_char_ign(encoding, locale_info, folded[f_pos],
- values[s_pos]))
+ if (!same_char_ign(encoding, locale_info, values[s_pos],
+ folded[f_pos]))
return RE_ERROR_FAILURE;
++s_pos;
@@ -7105,8 +7131,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD_REV(RE_State* state, RE_NextNode*
f_pos = 0;
}
- if (!same_char_ign(encoding, locale_info, folded[folded_len - f_pos -
- 1], values[length - s_pos - 1]))
+ if (!same_char_ign(encoding, locale_info, values[length - s_pos - 1],
+ folded[folded_len - f_pos - 1]))
return RE_ERROR_FAILURE;
++s_pos;
@@ -13522,8 +13548,8 @@ advance:
if (folded_pos < folded_len && same_char_ign(encoding,
locale_info,
- folded[folded_pos],
- gfolded[gfolded_pos])) {
+ gfolded[gfolded_pos],
+ folded[folded_pos])) {
++folded_pos;
++gfolded_pos;
} else if (node->status & RE_STATUS_FUZZY) {
@@ -13626,8 +13652,8 @@ advance:
}
if (folded_pos > 0 && same_char_ign(encoding, locale_info,
- folded[folded_pos - 1],
- gfolded[gfolded_pos - 1])) {
+ gfolded[gfolded_pos - 1],
+ folded[folded_pos - 1])) {
--folded_pos;
--gfolded_pos;
} else if (node->status & RE_STATUS_FUZZY) {
@@ -14194,7 +14220,7 @@ advance:
}
if (folded_pos < folded_len && same_char_ign(encoding,
- locale_info, folded[folded_pos], values[string_pos])) {
+ locale_info, values[string_pos], folded[folded_pos])) {
++string_pos;
++folded_pos;
@@ -14306,7 +14332,7 @@ advance:
}
if (folded_pos > 0 && same_char_ign(encoding, locale_info,
- folded[folded_pos - 1], values[string_pos - 1])) {
+ values[string_pos - 1], folded[folded_pos - 1])) {
--string_pos;
--folded_pos;
@@ -15540,6 +15566,7 @@ backtrack:
RE_Node* repeated;
RE_Node* test;
BOOL match;
+ Py_ssize_t skip_pos;
BOOL m;
size_t index;
TRACE(("%s\n", re_op_text[bt_data->op]))
@@ -15569,7 +15596,7 @@ backtrack:
index = node->values[0];
match = FALSE;
-
+ skip_pos = -1;
if (test->status & RE_STATUS_FUZZY) {
for (;;) {
RE_Position next_position;
@@ -15866,6 +15893,7 @@ backtrack:
if (!is_repeat_guarded(safe_state, index, pos,
RE_STATUS_TAIL)) {
match = TRUE;
+ skip_pos = new_pos;
break;
}
}
@@ -15923,6 +15951,7 @@ backtrack:
if (!is_repeat_guarded(safe_state, index, pos,
RE_STATUS_TAIL)) {
match = TRUE;
+ skip_pos = new_pos;
break;
}
}
@@ -16165,6 +16194,12 @@ backtrack:
}
node = node->next_1.node;
+
+ if (skip_pos >= 0) {
+ state->text_pos = skip_pos;
+ node = node->next_1.node;
+ }
+
goto advance;
} else {
/* The tail couldn't match. */
@@ -21238,7 +21273,7 @@ static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject*
break;
/* Don't bother to build a MatchObject. */
- switch (self->public_group_count) {
+ switch (self->visible_capture_count) {
case 0:
if (state.reverse) {
b = state.text_pos;
@@ -22981,6 +23016,7 @@ Py_LOCAL_INLINE(int) build_FUZZY(RE_CompileArgs* args) {
args->is_fuzzy = TRUE;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
++args->code;
@@ -23021,15 +23057,16 @@ Py_LOCAL_INLINE(int) build_ATOMIC(RE_CompileArgs* args) {
return RE_ERROR_ILLEGAL;
args->code = subargs.code;
- ++args->code;
-
- /* Check the subpattern. */
args->min_width += subargs.min_width;
args->has_captures |= subargs.has_captures;
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
+ ++args->code;
+
+ /* Check the subpattern. */
if (subargs.has_groups)
atomic_node->status |= RE_STATUS_HAS_GROUPS;
@@ -23142,6 +23179,7 @@ Py_LOCAL_INLINE(int) build_BRANCH(RE_CompileArgs* args) {
return RE_ERROR_ILLEGAL;
args->code = subargs.code;
+ args->visible_capture_count = subargs.visible_capture_count;
++args->code;
args->min_width += min_width;
@@ -23190,6 +23228,7 @@ Py_LOCAL_INLINE(int) build_CALL_REF(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
++args->code;
@@ -23286,14 +23325,15 @@ Py_LOCAL_INLINE(int) build_CONDITIONAL(RE_CompileArgs* args) {
return RE_ERROR_ILLEGAL;
args->code = subargs.code;
- ++args->code;
-
- /* Check the lookaround subpattern. */
args->has_captures |= subargs.has_captures;
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
+ ++args->code;
+
+ /* Check the lookaround subpattern. */
if (subargs.has_groups)
test_node->status |= RE_STATUS_HAS_GROUPS;
@@ -23321,6 +23361,7 @@ Py_LOCAL_INLINE(int) build_CONDITIONAL(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
min_width = subargs.min_width;
@@ -23349,6 +23390,7 @@ Py_LOCAL_INLINE(int) build_CONDITIONAL(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
min_width = min_ssize_t(min_width, subargs.min_width);
@@ -23427,6 +23469,10 @@ Py_LOCAL_INLINE(int) build_GROUP(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= TRUE;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
+
+ if (!args->in_define)
+ ++args->visible_capture_count;
++args->code;
@@ -23508,6 +23554,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
start_node->values[0] = group;
subargs = *args;
+ subargs.in_define = TRUE;
status = build_sequence(&subargs);
if (status != RE_ERROR_SUCCESS)
return status;
@@ -23517,6 +23564,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
min_width = subargs.min_width;
@@ -23540,6 +23588,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
args->code = subargs.code;
args->has_captures |= subargs.has_captures;
args->is_fuzzy |= subargs.is_fuzzy;
+ args->visible_capture_count = subargs.visible_capture_count;
if (group == 0) {
/* Join the 2 branches end-to-end and bypass it. The sequence
@@ -23552,6 +23601,7 @@ Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
} else {
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
min_width = min_ssize_t(min_width, subargs.min_width);
@@ -23624,6 +23674,7 @@ Py_LOCAL_INLINE(int) build_LOOKAROUND(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
if (subargs.has_groups)
lookaround_node->status |= RE_STATUS_HAS_GROUPS;
@@ -23760,6 +23811,7 @@ Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
++args->code;
@@ -23805,6 +23857,7 @@ Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) {
args->is_fuzzy |= subargs.is_fuzzy;
args->has_groups |= subargs.has_groups;
args->has_repeats = TRUE;
+ args->visible_capture_count = subargs.visible_capture_count;
++args->code;
@@ -24303,6 +24356,8 @@ Py_LOCAL_INLINE(BOOL) compile_to_nodes(RE_CODE* code, RE_CODE* end_code,
args.repeat_depth = 0;
args.is_fuzzy = FALSE;
args.within_fuzzy = FALSE;
+ args.visible_capture_count = 0;
+ args.in_define = FALSE;
status = build_sequence(&args);
if (status == RE_ERROR_ILLEGAL)
set_error(RE_ERROR_ILLEGAL, NULL);
@@ -24314,6 +24369,7 @@ Py_LOCAL_INLINE(BOOL) compile_to_nodes(RE_CODE* code, RE_CODE* end_code,
pattern->is_fuzzy = args.is_fuzzy;
pattern->do_search_start = TRUE;
pattern->start_node = args.start;
+ pattern->visible_capture_count = args.visible_capture_count;
/* Optimise the pattern. */
if (!optimise_pattern(pattern))
@@ -24548,6 +24604,7 @@ static PyObject* re_compile(PyObject* self_, PyObject* args) {
self->repeat_count = 0;
self->true_group_count = 0;
self->public_group_count = public_group_count;
+ self->visible_capture_count = 0;
self->group_end_index = 0;
self->groupindex = groupindex;
self->indexgroup = indexgroup;
diff --git a/Python2/_regex_core.py b/Python2/_regex_core.py
index 9a79e08..b101340 100644
--- a/Python2/_regex_core.py
+++ b/Python2/_regex_core.py
@@ -286,10 +286,14 @@ def _fold_case(info, string):
return _regex.fold_case(flags, string)
-def is_cased(info, char):
+def is_cased_i(info, char):
"Checks whether a character is cased."
return len(_regex.get_all_cases(info.flags, char)) > 1
+def is_cased_f(flags, char):
+ "Checks whether a character is cased."
+ return len(_regex.get_all_cases(flags, char)) > 1
+
def _compile_firstset(info, fs):
"Compiles the firstset for the pattern."
reverse = bool(info.flags & REVERSE)
@@ -314,7 +318,7 @@ def _check_firstset(info, reverse, fs):
# if i.case_flags:
# if isinstance(i, Character):
-# if is_cased(info, i.value):
+# if is_cased_i(info, i.value):
# return []
# elif isinstance(i, SetBase):
# return []
@@ -1891,9 +1895,6 @@ class RegexBase(object):
def compile(self, reverse=False, fuzzy=False):
return self._compile(reverse, fuzzy)
- def dump(self, indent, reverse):
- self._dump(indent, reverse)
-
def is_empty(self):
return False
@@ -1930,7 +1931,7 @@ class ZeroWidthBase(RegexBase):
flags |= REVERSE_OP
return [(self._opcode, flags)]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%s%s %s" % (INDENT * indent, self._op_name,
POS_TEXT[self.positive])
@@ -1950,7 +1951,7 @@ class Any(RegexBase):
flags |= FUZZY_OP
return [(self._opcode[reverse], flags)]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%s%s" % (INDENT * indent, self._op_name)
def max_width(self):
@@ -2003,7 +2004,7 @@ class Atomic(RegexBase):
return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
[(OP.END, )])
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sATOMIC" % (INDENT * indent)
self.subpattern.dump(indent + 1, reverse)
@@ -2114,7 +2115,7 @@ class Branch(RegexBase):
return code
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sBRANCH" % (INDENT * indent)
self.branches[0].dump(indent + 1, reverse)
for b in self.branches[1 : ]:
@@ -2444,7 +2445,7 @@ class CallGroup(RegexBase):
def _compile(self, reverse, fuzzy):
return [(OP.GROUP_CALL, self.call_ref)]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sGROUP_CALL %s" % (INDENT * indent, self.group)
def __eq__(self, other):
@@ -2517,7 +2518,7 @@ class Character(RegexBase):
return code.compile(reverse, fuzzy)
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
display = repr(unichr(self.value)).lstrip("bu")
print "%sCHARACTER %s %s%s" % (INDENT * indent,
POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags])
@@ -2605,7 +2606,7 @@ class Conditional(RegexBase):
return code
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sGROUP_EXISTS %s" % (INDENT * indent, self.group)
self.yes_item.dump(indent + 1, reverse)
if not self.no_item.is_empty():
@@ -2740,7 +2741,7 @@ class Fuzzy(RegexBase):
return ([(OP.FUZZY, flags) + tuple(arguments)] +
self.subpattern.compile(reverse, True) + [(OP.END,)])
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
constraints = self._constraints_to_string()
if constraints:
constraints = " " + constraints
@@ -2799,7 +2800,7 @@ class Grapheme(RegexBase):
return grapheme_matcher.compile(reverse, fuzzy)
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sGRAPHEME" % (INDENT * indent)
def max_width(self):
@@ -2864,7 +2865,7 @@ class GreedyRepeat(RegexBase):
return ([tuple(repeat)] + subpattern + [(OP.END, )])
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
if self.max_count is None:
limit = "INF"
else:
@@ -2962,7 +2963,7 @@ class Group(RegexBase):
return code
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
group = self.group
if group < 0:
group = private_groups[group]
@@ -3026,7 +3027,7 @@ class LookAround(RegexBase):
return ([(OP.LOOKAROUND, int(self.positive), int(not self.behind))] +
self.subpattern.compile(self.behind) + [(OP.END, )])
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sLOOK%s %s" % (INDENT * indent, self._dir_text[self.behind],
POS_TEXT[self.positive])
self.subpattern.dump(indent + 1, self.behind)
@@ -3106,7 +3107,7 @@ class LookAroundConditional(RegexBase):
return code
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print("%sCONDITIONAL %s %s" % (INDENT * indent,
self._dir_text[self.behind], POS_TEXT[self.positive]))
self.subpattern.dump(indent + 1, self.behind)
@@ -3177,7 +3178,7 @@ class Property(RegexBase):
flags |= FUZZY_OP
return [(self._opcode[self.case_flags, reverse], flags, self.value)]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
prop = PROPERTY_NAMES[self.value >> 16]
name, value = prop[0], prop[1][self.value & 0xFFFF]
print "%sPROPERTY %s %s:%s%s" % (INDENT * indent,
@@ -3259,7 +3260,7 @@ class Range(RegexBase):
return [(self._opcode[self.case_flags, reverse], flags, self.lower,
self.upper)]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
display_lower = repr(unichr(self.lower)).lstrip("bu")
display_upper = repr(unichr(self.upper)).lstrip("bu")
print "%sRANGE %s %s %s%s" % (INDENT * indent, POS_TEXT[self.positive],
@@ -3310,7 +3311,7 @@ class RefGroup(RegexBase):
flags |= FUZZY_OP
return [(self._opcode[self.case_flags, reverse], flags, self.group)]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sREF_GROUP %s%s" % (INDENT * indent, self.group,
CASE_TEXT[self.case_flags])
@@ -3355,7 +3356,7 @@ class Sequence(RegexBase):
if s.case_flags != case_flags:
# Different case sensitivity, so flush, unless neither the
# previous nor the new character are cased.
- if s.case_flags or is_cased(info, s.value):
+ if s.case_flags or is_cased_i(info, s.value):
Sequence._flush_characters(info, characters,
case_flags, items)
@@ -3366,7 +3367,7 @@ class Sequence(RegexBase):
if s.case_flags != case_flags:
# Different case sensitivity, so flush, unless the neither
# the previous nor the new string are cased.
- if s.case_flags or any(is_cased(info, c) for c in
+ if s.case_flags or any(is_cased_i(info, c) for c in
characters):
Sequence._flush_characters(info, characters,
case_flags, items)
@@ -3423,7 +3424,7 @@ class Sequence(RegexBase):
return code
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
for s in self.items:
s.dump(indent, reverse)
@@ -3434,16 +3435,82 @@ class Sequence(RegexBase):
# Disregard case_flags if all of the characters are case-less.
if case_flags & IGNORECASE:
- if not any(is_cased(info, c) for c in characters):
+ if not any(is_cased_i(info, c) for c in characters):
case_flags = NOCASE
- if len(characters) == 1:
- items.append(Character(characters[0], case_flags=case_flags))
+ if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
+ literals = Sequence._fix_full_casefold(characters)
+
+ for item in literals:
+ chars = item.characters
+
+ if len(chars) == 1:
+ items.append(Character(chars[0], case_flags=item.case_flags))
+ else:
+ items.append(String(chars, case_flags=item.case_flags))
else:
- items.append(String(characters, case_flags=case_flags))
+ if len(characters) == 1:
+ items.append(Character(characters[0], case_flags=case_flags))
+ else:
+ items.append(String(characters, case_flags=case_flags))
characters[:] = []
+ @staticmethod
+ def _fix_full_casefold(characters):
+ # Split a literal needing full case-folding into chunks that need it
+ # and chunks that can use simple case-folding, which is faster.
+ expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
+ _regex.get_expand_on_folding()]
+ string = _regex.fold_case(FULL_CASE_FOLDING, u''.join(unichr(c)
+ for c in characters)).lower()
+ chunks = []
+
+ for e in expanded:
+ found = string.find(e)
+
+ while found >= 0:
+ chunks.append((found, found + len(e)))
+ found = string.find(e, found + 1)
+
+ pos = 0
+ literals = []
+
+ for start, end in Sequence._merge_chunks(chunks):
+ if pos < start:
+ literals.append(Literal(characters[pos : start],
+ case_flags=IGNORECASE))
+
+ literals.append(Literal(characters[start : end],
+ case_flags=FULLIGNORECASE))
+ pos = end
+
+ if pos < len(characters):
+ literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))
+
+ return literals
+
+ @staticmethod
+ def _merge_chunks(chunks):
+ if len(chunks) < 2:
+ return chunks
+
+ chunks.sort()
+
+ start, end = chunks[0]
+ new_chunks = []
+
+ for s, e in chunks[1 : ]:
+ if s <= end:
+ end = max(end, e)
+ else:
+ new_chunks.append((start, end))
+ start, end = s, e
+
+ new_chunks.append((start, end))
+
+ return new_chunks
+
def is_empty(self):
return all(i.is_empty() for i in self.items)
@@ -3509,7 +3576,7 @@ class SetBase(RegexBase):
return code
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%s%s %s%s" % (INDENT * indent, self._op_name,
POS_TEXT[self.positive], CASE_TEXT[self.case_flags])
for i in self.items:
@@ -3789,7 +3856,7 @@ class String(RegexBase):
return [(self._opcode[self.case_flags, reverse], flags,
len(self.folded_characters)) + self.folded_characters]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
display = repr("".join(unichr(c) for c in self.characters)).lstrip("bu")
print "%sSTRING %s%s" % (INDENT * indent, display,
CASE_TEXT[self.case_flags])
@@ -3801,11 +3868,11 @@ class String(RegexBase):
return 0, self
class Literal(String):
- def _dump(self, indent, reverse):
- for c in self.characters:
- display = repr(unichr(c)).lstrip("bu")
- print "%sCHARACTER MATCH %s%s" % (INDENT * indent, display,
- CASE_TEXT[self.case_flags])
+ def dump(self, indent, reverse):
+ literal = ''.join(unichr(c) for c in self.characters)
+ display = repr(literal).lstrip("bu")
+ print "%sLITERAL MATCH %s%s" % (INDENT * indent, display,
+ CASE_TEXT[self.case_flags])
class StringSet(RegexBase):
_opcode = {(NOCASE, False): OP.STRING_SET, (IGNORECASE, False):
@@ -3862,7 +3929,7 @@ class StringSet(RegexBase):
return [(self._opcode[case_flags, reverse], index, min_len,
max_len)]
- def _dump(self, indent, reverse):
+ def dump(self, indent, reverse):
print "%sSTRING_SET %s%s" % (INDENT * indent, self.name,
CASE_TEXT[self.case_flags])
diff --git a/Python2/regex.py b/Python2/regex.py
index b34a14c..b29cbbd 100644
--- a/Python2/regex.py
+++ b/Python2/regex.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "escape", "findall", "finditer", "fullmatch", "match",
"U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X", "VERBOSE", "W",
"WORD", "error", "Regex"]
-__version__ = "2.4.113"
+__version__ = "2.4.117"
# --------------------------------------------------------------------
# Public interface.
diff --git a/Python2/test_regex.py b/Python2/test_regex.py
index 982eed7..b1fd510 100644
--- a/Python2/test_regex.py
+++ b/Python2/test_regex.py
@@ -3717,6 +3717,12 @@ thing
self.assertEqual(regex.search(r'a?yz', 'xxxxyz', flags=regex.FULLCASE |
regex.IGNORECASE).span(), (4, 6))
+ # Hg issue 230: Is it a bug of (?(DEFINE)...)
+ self.assertEqual(regex.findall(r'(?:(?![a-d]).)+', 'abcdefgh'),
+ ['efgh'])
+ self.assertEqual(regex.findall(r'''(?(DEFINE)(?P<mydef>(?:(?![a-d]).)))(?&mydef)+''',
+ 'abcdefgh'), ['efgh'])
+
def test_subscripted_captures(self):
self.assertEqual(regex.match(r'(?P<x>.)+',
'abc').expandf('{0} {0[0]} {0[-1]}'), 'abc abc abc')
diff --git a/Python3/_regex.c b/Python3/_regex.c
index ad468b6..3a6648c 100644
--- a/Python3/_regex.c
+++ b/Python3/_regex.c
@@ -640,6 +640,7 @@ typedef struct PatternObject {
RE_Node* start_test;
size_t true_group_count; /* The true number of capture groups. */
size_t public_group_count; /* The number of public capture groups. */
+ size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...). */
size_t repeat_count; /* The number of repeats. */
Py_ssize_t group_end_index; /* The number of group closures. */
PyObject* groupindex;
@@ -735,6 +736,7 @@ typedef struct RE_CompileArgs {
RE_Node* start; /* The start node. */
RE_Node* end; /* The end node. */
size_t repeat_depth; /* The nesting depth of the repeat. */
+ size_t visible_capture_count; /* The number of capture groups that are visible (not hidden in (?(DEFINE)...). */
BOOL forward; /* Whether it's a forward (not reverse) pattern. */
BOOL visible_captures; /* Whether all of the captures will be visible. */
BOOL has_captures; /* Whether the pattern has capture groups. */
@@ -742,6 +744,7 @@ typedef struct RE_CompileArgs {
BOOL within_fuzzy; /* Whether the subpattern is within a fuzzy section. */
BOOL has_groups; /* Whether the subpattern contains captures. */
BOOL has_repeats; /* Whether the subpattern contains repeats. */
+ BOOL in_define; /* Whether we're in (?(DEFINE)...). */
} RE_CompileArgs;
/* The string slices which will be concatenated to make the result string of
@@ -2415,6 +2418,31 @@ Py_LOCAL_INLINE(BOOL) same_char_ign(RE_EncodingTable* encoding, RE_LocaleInfo*
return FALSE;
}
+/* Checks whether 2 characters are the same, ignoring case. The first character
+ * is already case-folded or is a possible Turkic 'I'.
+ */
+Py_LOCAL_INLINE(BOOL) same_char_ign_turkic(RE_EncodingTable* encoding,
+ RE_LocaleInfo* locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
+ int count;
+ Py_UCS4 cases[RE_MAX_CASES];
+ int i;
+
+ if (ch1 == ch2)
+ return TRUE;
+
+ if (!encoding->possible_turkic(locale_info, ch1))
+ return FALSE;
+
+ count = encoding->all_turkic_i(locale_info, ch1, cases);
+
+ for (i = 1; i < count; i++) {
+ if (cases[i] == ch2)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
/* Wrapper for calling 'same_char' via a pointer. */
static BOOL same_char_ign_wrapper(RE_EncodingTable* encoding, RE_LocaleInfo*
locale_info, Py_UCS4 ch1, Py_UCS4 ch2) {
@@ -6215,7 +6243,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
Py_ssize_t length;
Py_ssize_t s_pos;
Py_UCS4 folded[RE_MAX_FOLDED];
-
state = safe_state->re_state;
encoding = state->encoding;
locale_info = state->locale_info;
@@ -6250,7 +6277,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
f_pos = 0;
}
- if (s_pos < length && same_char_ign(encoding, locale_info,
+ if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
values[s_pos], folded[f_pos])) {
++s_pos;
++f_pos;
@@ -6291,7 +6318,6 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
Py_ssize_t length;
Py_ssize_t s_pos;
Py_UCS4 folded[RE_MAX_FOLDED];
-
state = safe_state->re_state;
encoding = state->encoding;
locale_info = state->locale_info;
@@ -6325,7 +6351,7 @@ Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
f_pos = 0;
}
- if (s_pos < length && same_char_ign(encoding, locale_info,
+ if (s_pos < length && same_char_ign_turkic(encoding, locale_info,
values[length - s_pos - 1], folded[folded_len - f_pos - 1])) {
++s_pos;
++f_pos;
@@ -7026,8 +7052,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD(RE_State* state, RE_NextNode* next,
f_pos = 0;
}
- if (!same_char_ign(encoding, locale_info, folded[f_pos],
- values[s_pos]))
+ if (!same_char_ign(encoding, locale_info, values[s_pos],
+ folded[f_pos]))
return RE_ERROR_FAILURE;
++s_pos;
@@ -7099,8 +7125,8 @@ Py_LOCAL_INLINE(int) try_match_STRING_FLD_REV(RE_State* state, RE_NextNode*
f_pos = 0;
}
- if (!same_char_ign(encoding, locale_info, folded[folded_len - f_pos -
- 1], values[length - s_pos - 1]))
+ if (!same_char_ign(encoding, locale_info, values[length - s_pos - 1],
+ folded[folded_len - f_pos - 1]))
return RE_ERROR_FAILURE;
++s_pos;
@@ -13551,8 +13577,8 @@ advance:
if (folded_pos < folded_len && same_char_ign(encoding,
locale_info,
- folded[folded_pos],
- gfolded[gfolded_pos])) {
+ gfolded[gfolded_pos],
+ folded[folded_pos])) {
++folded_pos;
++gfolded_pos;
} else if (node->status & RE_STATUS_FUZZY) {
@@ -13655,8 +13681,8 @@ advance:
}
if (folded_pos > 0 && same_char_ign(encoding, locale_info,
- folded[folded_pos - 1],
- gfolded[gfolded_pos - 1])) {
+ gfolded[gfolded_pos - 1],
+ folded[folded_pos - 1])) {
--folded_pos;
--gfolded_pos;
} else if (node->status & RE_STATUS_FUZZY) {
@@ -14223,7 +14249,7 @@ advance:
}
if (folded_pos < folded_len && same_char_ign(encoding,
- locale_info, folded[folded_pos], values[string_pos])) {
+ locale_info, values[string_pos], folded[folded_pos])) {
++string_pos;
++folded_pos;
@@ -14335,7 +14361,7 @@ advance:
}
if (folded_pos > 0 && same_char_ign(encoding, locale_info,
- folded[folded_pos - 1], values[string_pos - 1])) {
+ values[string_pos - 1], folded[folded_pos - 1])) {
--string_pos;
--folded_pos;
@@ -15569,6 +15595,7 @@ backtrack:
RE_Node* repeated;
RE_Node* test;
BOOL match;
+ Py_ssize_t skip_pos;
BOOL m;
size_t index;
TRACE(("%s\n", re_op_text[bt_data->op]))
@@ -15598,7 +15625,7 @@ backtrack:
index = node->values[0];
match = FALSE;
-
+ skip_pos = -1;
if (test->status & RE_STATUS_FUZZY) {
for (;;) {
RE_Position next_position;
@@ -15895,6 +15922,7 @@ backtrack:
if (!is_repeat_guarded(safe_state, index, pos,
RE_STATUS_TAIL)) {
match = TRUE;
+ skip_pos = new_pos;
break;
}
}
@@ -15952,6 +15980,7 @@ backtrack:
if (!is_repeat_guarded(safe_state, index, pos,
RE_STATUS_TAIL)) {
match = TRUE;
+ skip_pos = new_pos;
break;
}
}
@@ -16194,6 +16223,12 @@ backtrack:
}
node = node->next_1.node;
+
+ if (skip_pos >= 0) {
+ state->text_pos = skip_pos;
+ node = node->next_1.node;
+ }
+
goto advance;
} else {
/* The tail couldn't match. */
@@ -21168,7 +21203,7 @@ static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject*
break;
/* Don't bother to build a MatchObject. */
- switch (self->public_group_count) {
+ switch (self->visible_capture_count) {
case 0:
if (state.reverse) {
b = state.text_pos;
@@ -22898,6 +22933,7 @@ Py_LOCAL_INLINE(int) build_FUZZY(RE_CompileArgs* args) {
args->is_fuzzy = TRUE;
args->has_groups |= subargs.has_groups;
args->has_repeats |= subargs.has_repeats;
+ args->visible_capture_count = subargs.visible_capture_count;
... 549 lines suppressed ...
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-regex.git
More information about the Python-modules-commits
mailing list