Patch against CPython py3k: adds the re.ASCII / re.A flag (str patterns match the full Unicode categories by default), forbids mixing str and bytes patterns, and updates base64, encodings/idna, py_compile, tarfile, _sre.c and the test suite to use bytes patterns where appropriate.

diff -r e390e60fcb20 Lib/base64.py --- a/Lib/base64.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/base64.py Sun Jun 29 22:35:22 2008 +0200 @@ -39,7 +39,7 @@ def _translate(s, altchars): return s.translate(translation) - + # Base64 encoding/decoding uses binascii def b64encode(s, altchars=None): @@ -126,7 +126,7 @@ def urlsafe_b64decode(s): return b64decode(s, b'-_') - + # Base32 encoding/decoding must be done in Python _b32alphabet = { 0: b'A', 9: b'J', 18: b'S', 27: b'3', @@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=N # characters because this will tell us how many null bytes to remove from # the end of the decoded string. padchars = 0 - mo = re.search('(?P<pad>[=]*)$', s) + mo = re.search(b'(?P<pad>[=]*)$', s) if mo: padchars = len(mo.group('pad')) if padchars > 0: @@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=N return b''.join(parts) - + # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns # lowercase. The RFC also recommends against accepting input case # insensitively. @@ -291,12 +291,12 @@ def b16decode(s, casefold=False): raise TypeError("expected bytes, not %s" % s.__class__.__name__) if casefold: s = s.upper() - if re.search('[^0-9A-F]', s): + if re.search(b'[^0-9A-F]', s): raise binascii.Error('Non-base16 digit found') return binascii.unhexlify(s) - + # Legacy interface. This code could be cleaned up since I don't believe # binascii has any line length limitations. It just doesn't seem worth it # though. The files should be opened in binary mode. @@ -353,7 +353,7 @@ def decodestring(s): return binascii.a2b_base64(s) - + # Usable as a script... def main(): """Small main program""" diff -r e390e60fcb20 Lib/encodings/idna.py --- a/Lib/encodings/idna.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/encodings/idna.py Sun Jun 29 22:35:22 2008 +0200 @@ -176,12 +176,10 @@ class Codec(codecs.Codec): return "", 0 # IDNA allows decoding to operate on Unicode strings, too. 
- if isinstance(input, bytes): - labels = dots.split(input) - else: - # Force to bytes + if not isinstance(input, bytes): + # XXX obviously wrong, see #3232 input = bytes(input) - labels = input.split(b".") + labels = input.split(b".") if labels and len(labels[-1]) == 0: trailing_dot = '.' diff -r e390e60fcb20 Lib/py_compile.py --- a/Lib/py_compile.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/py_compile.py Sun Jun 29 22:35:22 2008 +0200 @@ -86,7 +86,7 @@ def read_encoding(file, default): line = f.readline() if not line: break - m = re.match(r".*\bcoding:\s*(\S+)\b", line) + m = re.match(br".*\bcoding:\s*(\S+)\b", line) if m: return m.group(1).decode("ascii") return default diff -r e390e60fcb20 Lib/re.py --- a/Lib/re.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/re.py Sun Jun 29 22:35:22 2008 +0200 @@ -64,11 +64,18 @@ resulting RE will match the second chara \Z Matches only at the end of the string. \b Matches the empty string, but only at the start or end of a word. \B Matches the empty string, but not at the start or end of a word. - \d Matches any decimal digit; equivalent to the set [0-9]. - \D Matches any non-digit character; equivalent to the set [^0-9]. + \d Matches any decimal digit; equivalent to the set [0-9] in + bytes patterns or string patterns with the ASCII flag. + In string patterns without the ASCII flag, it will match the whole + range of Unicode digits. + \D Matches any non-digit character; equivalent to [^\d]. \s Matches any whitespace character; equivalent to [ \t\n\r\f\v]. \S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v]. - \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]. + \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_] + in bytes patterns or string patterns with the ASCII flag. + In string patterns without the ASCII flag, it will match the + range of Unicode alphanumeric characters (letters plus digits + plus underscore). 
With LOCALE, it will match the set [0-9_] plus characters defined as letters for the current locale. \W Matches the complement of \w. @@ -87,6 +94,12 @@ This module exports the following functi escape Backslash all non-alphanumerics in a string. Some of the functions in this module takes flags as optional parameters: + A ASCII For string patterns, make \w, \W, \b, \B, \d, \D + match the corresponding ASCII character categories + (rather than the whole Unicode categories, which is the + default). + For bytes patterns, this flag is the only available + behaviour and needn't be specified. I IGNORECASE Perform case-insensitive matching. L LOCALE Make \w, \W, \b, \B, dependent on the current locale. M MULTILINE "^" matches the beginning of lines (after a newline) @@ -95,7 +108,8 @@ Some of the functions in this module tak as the end of the string. S DOTALL "." matches any character at all, including the newline. X VERBOSE Ignore whitespace and comments for nicer looking RE's. - U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale. + U UNICODE For compatibility only. Ignored for string patterns (it + is the default), and forbidden for bytes patterns. This module also defines an exception 'error'. 
@@ -107,16 +121,17 @@ import sre_parse # public symbols __all__ = [ "match", "search", "sub", "subn", "split", "findall", - "compile", "purge", "template", "escape", "I", "L", "M", "S", "X", - "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", + "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X", + "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", "UNICODE", "error" ] __version__ = "2.2.1" # flags +A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale" I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale -U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale +U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale" M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments diff -r e390e60fcb20 Lib/sre_constants.py --- a/Lib/sre_constants.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/sre_constants.py Sun Jun 29 22:35:22 2008 +0200 @@ -207,9 +207,10 @@ SRE_FLAG_LOCALE = 4 # honour system loca SRE_FLAG_LOCALE = 4 # honour system locale SRE_FLAG_MULTILINE = 8 # treat target as multiline string SRE_FLAG_DOTALL = 16 # treat target as a single string -SRE_FLAG_UNICODE = 32 # use unicode locale +SRE_FLAG_UNICODE = 32 # use unicode "locale" SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_ASCII = 256 # use ascii "locale" # flags for INFO primitive SRE_INFO_PREFIX = 1 # has prefix diff -r e390e60fcb20 Lib/sre_parse.py --- a/Lib/sre_parse.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/sre_parse.py Sun Jun 29 22:35:22 2008 +0200 @@ -200,7 +200,7 @@ class Tokenizer: except IndexError: raise error("bogus escape (end of line)") if isinstance(self.string, bytes): - char = chr(c) + c = chr(c) char = char + c 
self.index = self.index + len(char) self.next = char @@ -672,6 +672,18 @@ def _parse(source, state): return subpattern +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + if not flags & SRE_FLAG_ASCII: + flags |= SRE_FLAG_UNICODE + elif flags & SRE_FLAG_UNICODE: + raise ValueError("ASCII and UNICODE flags are incompatible") + else: + if flags & SRE_FLAG_UNICODE: + raise ValueError("can't use UNICODE flag with a bytes pattern") + return flags + def parse(str, flags=0, pattern=None): # parse 're' pattern into list of (opcode, argument) tuples @@ -683,6 +695,7 @@ def parse(str, flags=0, pattern=None): pattern.str = str p = _parse_sub(source, pattern, 0) + p.pattern.flags = fix_flags(str, p.pattern.flags) tail = source.get() if tail == ")": diff -r e390e60fcb20 Lib/tarfile.py --- a/Lib/tarfile.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/tarfile.py Sun Jun 29 22:35:22 2008 +0200 @@ -1368,7 +1368,7 @@ class TarInfo(object): # "%d %s=%s\n" % (length, keyword, value). length is the size # of the complete record including the length field itself and # the newline. keyword and value are both UTF-8 encoded strings. 
- regex = re.compile(r"(\d+) ([^=]+)=", re.U) + regex = re.compile(br"(\d+) ([^=]+)=") pos = 0 while True: match = regex.match(buf, pos) diff -r e390e60fcb20 Lib/test/re_tests.py --- a/Lib/test/re_tests.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/test/re_tests.py Sun Jun 29 22:35:22 2008 +0200 @@ -661,14 +661,10 @@ 123""", SUCCEED, 'found', 'abc'), ('^([ab]*?)(?<!(a))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), ] -try: - u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'") -except SyntaxError: - pass -else: - tests.extend([ +u = '\N{LATIN CAPITAL LETTER A WITH DIAERESIS}' +tests.extend([ # bug 410271: \b broken under locales (r'\b.\b', 'a', SUCCEED, 'found', 'a'), (r'(?u)\b.\b', u, SUCCEED, 'found', u), (r'(?u)\w', u, SUCCEED, 'found', u), - ]) +]) diff -r e390e60fcb20 Lib/test/test_bytes.py --- a/Lib/test/test_bytes.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/test/test_bytes.py Sun Jun 29 22:35:22 2008 +0200 @@ -498,7 +498,7 @@ class ByteArrayTest(BaseBytesTest): def by(s): return bytearray(map(ord, s)) b = by("Hello, world") - self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")]) + self.assertEqual(re.findall(br"\w+", b), [by("Hello"), by("world")]) def test_setitem(self): b = bytearray([1, 2, 3]) diff -r e390e60fcb20 Lib/test/test_mmap.py --- a/Lib/test/test_mmap.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/test/test_mmap.py Sun Jun 29 22:35:22 2008 +0200 @@ -54,7 +54,7 @@ class MmapTests(unittest.TestCase): m.flush() # Test doing a regular expression match in an mmap'ed file - match = re.search('[A-Za-z]+', m) + match = re.search(b'[A-Za-z]+', m) if match is None: self.fail('regex match on mmap failed!') else: diff -r e390e60fcb20 Lib/test/test_re.py --- a/Lib/test/test_re.py Sat Jun 28 20:23:49 2008 +0200 +++ b/Lib/test/test_re.py Sun Jun 29 22:35:22 2008 +0200 @@ -82,23 +82,6 @@ class ReTests(unittest.TestCase): 'abc\ndef\n') self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 'abc\ndef\n') - - def test_bug_1140(self): - # re.sub(x, y, 
b'') should return b'', not '', and - # re.sub(x, y, '') should return '', not b''. - # Also: - # re.sub(x, y, str(x)) should return str(y), and - # re.sub(x, y, bytes(x)) should return - # str(y) if isinstance(y, str) else unicode(y). - for x in 'x', b'x': - for y in 'y', b'y': - z = re.sub(x, y, b'') - self.assertEqual(z, b'') - self.assertEqual(type(z), bytes) - # - z = re.sub(x, y, '') - self.assertEqual(z, '') - self.assertEqual(type(z), str) def test_bug_1661(self): # Verify that flags do not get silently ignored with compiled patterns @@ -607,8 +590,8 @@ class ReTests(unittest.TestCase): import array for typecode in 'bBuhHiIlLfd': a = array.array(typecode) - self.assertEqual(re.compile("bla").match(a), None) - self.assertEqual(re.compile("").match(a).groups(), ()) + self.assertEqual(re.compile(b"bla").match(a), None) + self.assertEqual(re.compile(b"").match(a).groups(), ()) def test_inline_flags(self): # Bug #1700 @@ -650,6 +633,43 @@ class ReTests(unittest.TestCase): self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') + + def test_bytes_str_mixing(self): + # Mixing str and bytes is disallowed + pat = re.compile('.') + bpat = re.compile(b'.') + self.assertRaises(TypeError, pat.match, b'b') + self.assertRaises(TypeError, bpat.match, 'b') + self.assertRaises(TypeError, pat.sub, b'b', 'c') + self.assertRaises(TypeError, pat.sub, 'b', b'c') + self.assertRaises(TypeError, pat.sub, b'b', b'c') + self.assertRaises(TypeError, bpat.sub, b'b', 'c') + self.assertRaises(TypeError, bpat.sub, 'b', b'c') + self.assertRaises(TypeError, bpat.sub, 'b', 'c') + + def test_unicode_flag(self): + # String patterns + for flags in (0, re.UNICODE): + pat = re.compile('\xc0', flags | re.IGNORECASE) + self.assertNotEqual(pat.match('\xe0'), None) + pat = re.compile('\w', flags) + self.assertNotEqual(pat.match('\xe0'), None) + pat = re.compile('\xc0', re.ASCII | 
re.IGNORECASE) + self.assertEqual(pat.match('\xe0'), None) + pat = re.compile('\w', re.ASCII) + self.assertEqual(pat.match('\xe0'), None) + # Bytes patterns + for flags in (0, re.ASCII): + pat = re.compile(b'\xc0', re.IGNORECASE) + self.assertEqual(pat.match(b'\xe0'), None) + pat = re.compile(b'\w') + self.assertEqual(pat.match(b'\xe0'), None) + # Incompatibilities + self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE) + self.assertRaises(ValueError, re.compile, b'(?u)\w') + self.assertRaises(ValueError, re.compile, '\w', + re.UNICODE | re.ASCII) + self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII) def run_re_tests(): @@ -725,23 +745,25 @@ def run_re_tests(): else: print('=== Failed incorrectly', t) - # Try the match on a unicode string, and check that it - # still succeeds. + # Try the match with both pattern and string converted to + # bytes, and check that it still succeeds. try: - result = obj.search(str(s, "latin-1")) - if result is None: - print('=== Fails on unicode match', t) - except NameError: - continue # 1.5.2 - except TypeError: - continue # unicode test case - - # Try the match on a unicode pattern, and check that it - # still succeeds. - obj=re.compile(str(pattern, "latin-1")) - result = obj.search(s) - if result is None: - print('=== Fails on unicode pattern match', t) + bpat = bytes(pattern, "ascii") + bs = bytes(s, "ascii") + except UnicodeEncodeError: + # skip non-ascii tests + pass + else: + try: + bpat = re.compile(bpat) + except Exception: + print('=== Fails on bytes pattern compile', t) + if verbose: + traceback.print_exc(file=sys.stdout) + else: + bytes_result = bpat.search(bs) + if bytes_result is None: + print('=== Fails on bytes pattern match', t) # Try the match with the search area limited to the extent # of the match and see if it still succeeds. \B will @@ -764,10 +786,11 @@ def run_re_tests(): # Try the match with LOCALE enabled, and check that it # still succeeds. 
- obj = re.compile(pattern, re.LOCALE) - result = obj.search(s) - if result is None: - print('=== Fails on locale-sensitive match', t) + if '(?u)' not in pattern: + obj = re.compile(pattern, re.LOCALE) + result = obj.search(s) + if result is None: + print('=== Fails on locale-sensitive match', t) # Try the match with UNICODE locale enabled, and check # that it still succeeds. diff -r e390e60fcb20 Modules/_sre.c --- a/Modules/_sre.c Sat Jun 28 20:23:49 2008 +0200 +++ b/Modules/_sre.c Sun Jun 29 22:35:22 2008 +0200 @@ -1691,7 +1691,7 @@ getstring(PyObject* string, Py_ssize_t* /* get pointer to string buffer */ view.len = -1; buffer = Py_TYPE(string)->tp_as_buffer; - if (!buffer || !buffer->bf_getbuffer || + if (!buffer || !buffer->bf_getbuffer || (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) { PyErr_SetString(PyExc_TypeError, "expected string or buffer"); return NULL; @@ -1717,7 +1717,7 @@ getstring(PyObject* string, Py_ssize_t* if (PyBytes_Check(string) || bytes == size) charsize = 1; #if defined(HAVE_UNICODE) - else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) + else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) charsize = sizeof(Py_UNICODE); #endif else { @@ -1729,7 +1729,7 @@ getstring(PyObject* string, Py_ssize_t* *p_charsize = charsize; if (ptr == NULL) { - PyErr_SetString(PyExc_ValueError, + PyErr_SetString(PyExc_ValueError, "Buffer is NULL"); } return ptr; @@ -1753,6 +1753,17 @@ state_init(SRE_STATE* state, PatternObje ptr = getstring(string, &length, &charsize); if (!ptr) return NULL; + + if (charsize == 1 && pattern->charsize > 1) { + PyErr_SetString(PyExc_TypeError, + "can't use a string pattern on a bytes-like object"); + return NULL; + } + if (charsize > 1 && pattern->charsize == 1) { + PyErr_SetString(PyExc_TypeError, + "can't use a bytes pattern on a string-like object"); + return NULL; + } /* adjust boundaries */ if (start < 0) @@ -2700,6 +2711,16 @@ _compile(PyObject* self_, PyObject* args return NULL; } + if (pattern 
== Py_None) + self->charsize = -1; + else { + Py_ssize_t p_length; + if (!getstring(pattern, &p_length, &self->charsize)) { + PyObject_DEL(self); + return NULL; + } + } + Py_INCREF(pattern); self->pattern = pattern; diff -r e390e60fcb20 Modules/sre.h --- a/Modules/sre.h Sat Jun 28 20:23:49 2008 +0200 +++ b/Modules/sre.h Sun Jun 29 22:35:22 2008 +0200 @@ -30,6 +30,7 @@ typedef struct { PyObject* pattern; /* pattern source (or None) */ int flags; /* flags used when compiling pattern source */ PyObject *weakreflist; /* List of weak references */ + int charsize; /* pattern charsize (or -1) */ /* pattern code */ Py_ssize_t codesize; SRE_CODE code[1];