diff -r e390e60fcb20 Lib/base64.py
--- a/Lib/base64.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/base64.py Sun Jun 29 22:15:16 2008 +0200
@@ -39,7 +39,7 @@ def _translate(s, altchars):
return s.translate(translation)
-
+
# Base64 encoding/decoding uses binascii
def b64encode(s, altchars=None):
@@ -126,7 +126,7 @@ def urlsafe_b64decode(s):
return b64decode(s, b'-_')
-
+
# Base32 encoding/decoding must be done in Python
_b32alphabet = {
0: b'A', 9: b'J', 18: b'S', 27: b'3',
@@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=N
# characters because this will tell us how many null bytes to remove from
# the end of the decoded string.
padchars = 0
- mo = re.search('(?P<pad>[=]*)$', s)
+ mo = re.search(b'(?P<pad>[=]*)$', s)
if mo:
padchars = len(mo.group('pad'))
if padchars > 0:
@@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=N
return b''.join(parts)
-
+
# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
# lowercase. The RFC also recommends against accepting input case
# insensitively.
@@ -291,12 +291,12 @@ def b16decode(s, casefold=False):
raise TypeError("expected bytes, not %s" % s.__class__.__name__)
if casefold:
s = s.upper()
- if re.search('[^0-9A-F]', s):
+ if re.search(b'[^0-9A-F]', s):
raise binascii.Error('Non-base16 digit found')
return binascii.unhexlify(s)
-
+
# Legacy interface. This code could be cleaned up since I don't believe
# binascii has any line length limitations. It just doesn't seem worth it
# though. The files should be opened in binary mode.
@@ -353,7 +353,7 @@ def decodestring(s):
return binascii.a2b_base64(s)
-
+
# Usable as a script...
def main():
"""Small main program"""
diff -r e390e60fcb20 Lib/encodings/idna.py
--- a/Lib/encodings/idna.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/encodings/idna.py Sun Jun 29 22:15:16 2008 +0200
@@ -176,12 +176,10 @@ class Codec(codecs.Codec):
return "", 0
# IDNA allows decoding to operate on Unicode strings, too.
- if isinstance(input, bytes):
- labels = dots.split(input)
- else:
- # Force to bytes
+ if not isinstance(input, bytes):
+ # XXX obviously wrong, see #3232
input = bytes(input)
- labels = input.split(b".")
+ labels = input.split(b".")
if labels and len(labels[-1]) == 0:
trailing_dot = '.'
diff -r e390e60fcb20 Lib/py_compile.py
--- a/Lib/py_compile.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/py_compile.py Sun Jun 29 22:15:16 2008 +0200
@@ -86,7 +86,7 @@ def read_encoding(file, default):
line = f.readline()
if not line:
break
- m = re.match(r".*\bcoding:\s*(\S+)\b", line)
+ m = re.match(br".*\bcoding:\s*(\S+)\b", line)
if m:
return m.group(1).decode("ascii")
return default
diff -r e390e60fcb20 Lib/re.py
--- a/Lib/re.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/re.py Sun Jun 29 22:15:16 2008 +0200
@@ -64,11 +64,18 @@ resulting RE will match the second chara
\Z Matches only at the end of the string.
\b Matches the empty string, but only at the start or end of a word.
\B Matches the empty string, but not at the start or end of a word.
- \d Matches any decimal digit; equivalent to the set [0-9].
- \D Matches any non-digit character; equivalent to the set [^0-9].
+ \d Matches any decimal digit; equivalent to the set [0-9] in
+ bytes patterns or string patterns with the ASCII flag.
+ In string patterns without the ASCII flag, it will match the whole
+ range of Unicode digits.
+ \D Matches any non-digit character; equivalent to [^\d].
\s Matches any whitespace character; equivalent to [ \t\n\r\f\v].
\S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
- \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
+ \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
+ in bytes patterns or string patterns with the ASCII flag.
+ In string patterns without the ASCII flag, it will match the
+ range of Unicode alphanumeric characters (letters plus digits
+ plus underscore).
With LOCALE, it will match the set [0-9_] plus characters defined
as letters for the current locale.
\W Matches the complement of \w.
@@ -87,6 +94,12 @@ This module exports the following functi
escape Backslash all non-alphanumerics in a string.
Some of the functions in this module takes flags as optional parameters:
+ A ASCII For string patterns, make \w, \W, \b, \B, \d, \D
+ match the corresponding ASCII character categories
+ (rather than the whole Unicode categories, which is the
+ default).
+ For bytes patterns, this flag is the only available
+ behaviour and needn't be specified.
I IGNORECASE Perform case-insensitive matching.
L LOCALE Make \w, \W, \b, \B, dependent on the current locale.
M MULTILINE "^" matches the beginning of lines (after a newline)
@@ -95,7 +108,8 @@ Some of the functions in this module tak
as the end of the string.
S DOTALL "." matches any character at all, including the newline.
X VERBOSE Ignore whitespace and comments for nicer looking RE's.
- U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale.
+ U UNICODE For compatibility only. Ignored for string patterns (it
+ is the default), and forbidden for bytes patterns.
This module also defines an exception 'error'.
@@ -107,16 +121,17 @@ import sre_parse
# public symbols
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
- "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
- "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
+ "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X",
+ "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
"UNICODE", "error" ]
__version__ = "2.2.1"
# flags
+A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
-U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
+U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
diff -r e390e60fcb20 Lib/sre_constants.py
--- a/Lib/sre_constants.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/sre_constants.py Sun Jun 29 22:15:16 2008 +0200
@@ -207,9 +207,10 @@ SRE_FLAG_LOCALE = 4 # honour system loca
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
-SRE_FLAG_UNICODE = 32 # use unicode locale
+SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
+SRE_FLAG_ASCII = 256 # use ascii "locale"
# flags for INFO primitive
SRE_INFO_PREFIX = 1 # has prefix
diff -r e390e60fcb20 Lib/sre_parse.py
--- a/Lib/sre_parse.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/sre_parse.py Sun Jun 29 22:15:16 2008 +0200
@@ -200,7 +200,7 @@ class Tokenizer:
except IndexError:
raise error("bogus escape (end of line)")
if isinstance(self.string, bytes):
- char = chr(c)
+ c = chr(c)
char = char + c
self.index = self.index + len(char)
self.next = char
@@ -672,9 +672,22 @@ def _parse(source, state):
return subpattern
+def fix_flags(src, flags):
+ # Check and fix flags according to the type of pattern (str or bytes)
+ if isinstance(src, str):
+ if not flags & SRE_FLAG_ASCII:
+ flags |= SRE_FLAG_UNICODE
+ elif flags & SRE_FLAG_UNICODE:
+ raise ValueError("ASCII and UNICODE flags are incompatible")
+ else:
+ if flags & SRE_FLAG_UNICODE:
+ raise ValueError("can't use UNICODE flag with a bytes pattern")
+ return flags
+
def parse(str, flags=0, pattern=None):
# parse 're' pattern into list of (opcode, argument) tuples
+ flags = fix_flags(str, flags)
source = Tokenizer(str)
if pattern is None:
diff -r e390e60fcb20 Lib/tarfile.py
--- a/Lib/tarfile.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/tarfile.py Sun Jun 29 22:15:16 2008 +0200
@@ -1368,7 +1368,7 @@ class TarInfo(object):
# "%d %s=%s\n" % (length, keyword, value). length is the size
# of the complete record including the length field itself and
# the newline. keyword and value are both UTF-8 encoded strings.
- regex = re.compile(r"(\d+) ([^=]+)=", re.U)
+ regex = re.compile(br"(\d+) ([^=]+)=")
pos = 0
while True:
match = regex.match(buf, pos)
diff -r e390e60fcb20 Lib/test/re_tests.py
--- a/Lib/test/re_tests.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/test/re_tests.py Sun Jun 29 22:15:17 2008 +0200
@@ -661,14 +661,10 @@ 123""", SUCCEED, 'found', 'abc'),
('^([ab]*?)(?<!(a))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'),
]
-try:
- u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'")
-except SyntaxError:
- pass
-else:
- tests.extend([
+u = '\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'
+tests.extend([
# bug 410271: \b broken under locales
(r'\b.\b', 'a', SUCCEED, 'found', 'a'),
(r'(?u)\b.\b', u, SUCCEED, 'found', u),
(r'(?u)\w', u, SUCCEED, 'found', u),
- ])
+])
diff -r e390e60fcb20 Lib/test/test_bytes.py
--- a/Lib/test/test_bytes.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/test/test_bytes.py Sun Jun 29 22:15:17 2008 +0200
@@ -498,7 +498,7 @@ class ByteArrayTest(BaseBytesTest):
def by(s):
return bytearray(map(ord, s))
b = by("Hello, world")
- self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")])
+ self.assertEqual(re.findall(br"\w+", b), [by("Hello"), by("world")])
def test_setitem(self):
b = bytearray([1, 2, 3])
diff -r e390e60fcb20 Lib/test/test_mmap.py
--- a/Lib/test/test_mmap.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/test/test_mmap.py Sun Jun 29 22:15:17 2008 +0200
@@ -54,7 +54,7 @@ class MmapTests(unittest.TestCase):
m.flush()
# Test doing a regular expression match in an mmap'ed file
- match = re.search('[A-Za-z]+', m)
+ match = re.search(b'[A-Za-z]+', m)
if match is None:
self.fail('regex match on mmap failed!')
else:
diff -r e390e60fcb20 Lib/test/test_re.py
--- a/Lib/test/test_re.py Sat Jun 28 20:23:49 2008 +0200
+++ b/Lib/test/test_re.py Sun Jun 29 22:15:17 2008 +0200
@@ -82,23 +82,6 @@ class ReTests(unittest.TestCase):
'abc\ndef\n')
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
'abc\ndef\n')
-
- def test_bug_1140(self):
- # re.sub(x, y, b'') should return b'', not '', and
- # re.sub(x, y, '') should return '', not b''.
- # Also:
- # re.sub(x, y, str(x)) should return str(y), and
- # re.sub(x, y, bytes(x)) should return
- # str(y) if isinstance(y, str) else unicode(y).
- for x in 'x', b'x':
- for y in 'y', b'y':
- z = re.sub(x, y, b'')
- self.assertEqual(z, b'')
- self.assertEqual(type(z), bytes)
- #
- z = re.sub(x, y, '')
- self.assertEqual(z, '')
- self.assertEqual(type(z), str)
def test_bug_1661(self):
# Verify that flags do not get silently ignored with compiled patterns
@@ -607,8 +590,8 @@ class ReTests(unittest.TestCase):
import array
for typecode in 'bBuhHiIlLfd':
a = array.array(typecode)
- self.assertEqual(re.compile("bla").match(a), None)
- self.assertEqual(re.compile("").match(a).groups(), ())
+ self.assertEqual(re.compile(b"bla").match(a), None)
+ self.assertEqual(re.compile(b"").match(a).groups(), ())
def test_inline_flags(self):
# Bug #1700
@@ -650,6 +633,41 @@ class ReTests(unittest.TestCase):
self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
self.assertEqual(pattern.sub('#', '\n'), '#\n#')
+
+ def test_bytes_str_mixing(self):
+ # Mixing str and bytes is disallowed
+ pat = re.compile('.')
+ bpat = re.compile(b'.')
+ self.assertRaises(TypeError, pat.match, b'b')
+ self.assertRaises(TypeError, bpat.match, 'b')
+ self.assertRaises(TypeError, pat.sub, b'b', 'c')
+ self.assertRaises(TypeError, pat.sub, 'b', b'c')
+ self.assertRaises(TypeError, pat.sub, b'b', b'c')
+ self.assertRaises(TypeError, bpat.sub, b'b', 'c')
+ self.assertRaises(TypeError, bpat.sub, 'b', b'c')
+ self.assertRaises(TypeError, bpat.sub, 'b', 'c')
+
+ def test_unicode_flag(self):
+ # String patterns
+ for flags in (0, re.UNICODE):
+ pat = re.compile('\xc0', flags | re.IGNORECASE)
+ self.assertNotEqual(pat.match('\xe0'), None)
+ pat = re.compile('\w', flags)
+ self.assertNotEqual(pat.match('\xe0'), None)
+ pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
+ self.assertEqual(pat.match('\xe0'), None)
+ pat = re.compile('\w', re.ASCII)
+ self.assertEqual(pat.match('\xe0'), None)
+ # Bytes patterns
+ for flags in (0, re.ASCII):
+ pat = re.compile(b'\xc0', re.IGNORECASE)
+ self.assertEqual(pat.match(b'\xe0'), None)
+ pat = re.compile(b'\w')
+ self.assertEqual(pat.match(b'\xe0'), None)
+ # Incompatibilities
+ self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
+ self.assertRaises(ValueError, re.compile, '\w',
+ re.UNICODE | re.ASCII)
def run_re_tests():
@@ -725,23 +743,25 @@ def run_re_tests():
else:
print('=== Failed incorrectly', t)
- # Try the match on a unicode string, and check that it
- # still succeeds.
+ # Try the match with both pattern and string converted to
+ # bytes, and check that it still succeeds.
try:
- result = obj.search(str(s, "latin-1"))
- if result is None:
- print('=== Fails on unicode match', t)
- except NameError:
- continue # 1.5.2
- except TypeError:
- continue # unicode test case
-
- # Try the match on a unicode pattern, and check that it
- # still succeeds.
- obj=re.compile(str(pattern, "latin-1"))
- result = obj.search(s)
- if result is None:
- print('=== Fails on unicode pattern match', t)
+ bpat = bytes(pattern, "ascii")
+ bs = bytes(s, "ascii")
+ except UnicodeEncodeError:
+ # skip non-ascii tests
+ pass
+ else:
+ try:
+ bpat = re.compile(bpat)
+ except Exception:
+ print('=== Fails on bytes pattern compile', t)
+ if verbose:
+ traceback.print_exc(file=sys.stdout)
+ else:
+ bytes_result = bpat.search(bs)
+ if bytes_result is None:
+ print('=== Fails on bytes pattern match', t)
# Try the match with the search area limited to the extent
# of the match and see if it still succeeds. \B will
@@ -764,10 +784,11 @@ def run_re_tests():
# Try the match with LOCALE enabled, and check that it
# still succeeds.
- obj = re.compile(pattern, re.LOCALE)
- result = obj.search(s)
- if result is None:
- print('=== Fails on locale-sensitive match', t)
+ if '(?u)' not in pattern:
+ obj = re.compile(pattern, re.LOCALE)
+ result = obj.search(s)
+ if result is None:
+ print('=== Fails on locale-sensitive match', t)
# Try the match with UNICODE locale enabled, and check
# that it still succeeds.
diff -r e390e60fcb20 Modules/_sre.c
--- a/Modules/_sre.c Sat Jun 28 20:23:49 2008 +0200
+++ b/Modules/_sre.c Sun Jun 29 22:15:17 2008 +0200
@@ -1691,7 +1691,7 @@ getstring(PyObject* string, Py_ssize_t*
/* get pointer to string buffer */
view.len = -1;
buffer = Py_TYPE(string)->tp_as_buffer;
- if (!buffer || !buffer->bf_getbuffer ||
+ if (!buffer || !buffer->bf_getbuffer ||
(*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
return NULL;
@@ -1717,7 +1717,7 @@ getstring(PyObject* string, Py_ssize_t*
if (PyBytes_Check(string) || bytes == size)
charsize = 1;
#if defined(HAVE_UNICODE)
- else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
+ else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
charsize = sizeof(Py_UNICODE);
#endif
else {
@@ -1729,7 +1729,7 @@ getstring(PyObject* string, Py_ssize_t*
*p_charsize = charsize;
if (ptr == NULL) {
- PyErr_SetString(PyExc_ValueError,
+ PyErr_SetString(PyExc_ValueError,
"Buffer is NULL");
}
return ptr;
@@ -1753,6 +1753,17 @@ state_init(SRE_STATE* state, PatternObje
ptr = getstring(string, &length, &charsize);
if (!ptr)
return NULL;
+
+ if (charsize == 1 && pattern->charsize > 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "can't use a string pattern on a bytes-like object");
+ return NULL;
+ }
+ if (charsize > 1 && pattern->charsize == 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "can't use a bytes pattern on a string-like object");
+ return NULL;
+ }
/* adjust boundaries */
if (start < 0)
@@ -2700,6 +2711,16 @@ _compile(PyObject* self_, PyObject* args
return NULL;
}
+ if (pattern == Py_None)
+ self->charsize = -1;
+ else {
+ Py_ssize_t p_length;
+ if (!getstring(pattern, &p_length, &self->charsize)) {
+ PyObject_DEL(self);
+ return NULL;
+ }
+ }
+
Py_INCREF(pattern);
self->pattern = pattern;
diff -r e390e60fcb20 Modules/sre.h
--- a/Modules/sre.h Sat Jun 28 20:23:49 2008 +0200
+++ b/Modules/sre.h Sun Jun 29 22:15:17 2008 +0200
@@ -30,6 +30,7 @@ typedef struct {
PyObject* pattern; /* pattern source (or None) */
int flags; /* flags used when compiling pattern source */
PyObject *weakreflist; /* List of weak references */
+ int charsize; /* pattern charsize (or -1) */
/* pattern code */
Py_ssize_t codesize;
SRE_CODE code[1];