python · vstinner · Sep 25, 2017 · Sep 14, 2017 · Sep 15, 2017 · Sep 21, 2017
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
@@ -34,6 +34,7 @@
 except UnicodeEncodeError:
     raise unittest.SkipTest("filename is not encodable to utf8")
 SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata")
+UTF8_BUG_XMLFILE = findfile("expat224_utf8_bug.xml", subdir="xmltestdata")
 
 SAMPLE_XML = """\
 <body>
@@ -1739,6 +1740,37 @@ def __eq__(self, other):
         self.assertIsInstance(e[0].tag, str)
         self.assertEqual(e[0].tag, 'changed')
 
+    def check_expat224_utf8_bug(self, text):
+        # Parse <a b="text"/> and check that 'b' decodes back to text.
+        attr = ET.XML(b'<a b="%s"/>' % text).get('b')
+        self.assertEqual(attr, text.decode('utf-8'))
+
+    def test_expat224_utf8_bug(self):
+        # bpo-31170: Expat 2.2.3 shipped a buggy UTF-8 decoder; verify
+        # that the fix made in Expat 2.2.4 is in effect.
+        #
+        # Exercise buffer bounds at even and odd positions: 1024 two-byte
+        # characters, alone and then shifted by one leading ASCII byte.
+
+        payload = b'\xc3\xa0' * 1024
+
+        for prefix in (b'', b'x'):
+            self.check_expat224_utf8_bug(prefix + payload)
+
+    def test_expat224_utf8_bug_file(self):
+        # bpo-31170: parse the XML data that reproduced the Expat bug.
+        with open(UTF8_BUG_XMLFILE, 'rb') as fp:
+            raw = fp.read()
+        xmlattr = ET.fromstring(raw).get('b')
+
+        # "Parse" manually the XML file to extract the value of the 'b'
+        # attribute of <a b='xxx' />.  The XML parser turns every line break
+        # (CRLF, LF or CR) of the value into one space, so do the same here.
+        text = raw.decode('utf-8').strip().replace('\r\n', ' ')
+        text = text.replace('\r', ' ').replace('\n', ' ')
+        self.assertEqual(xmlattr, text[6:-4])
+
+
 
 # --------------------------------------------------------------------
 

diff --git a/Lib/test/xmltestdata/expat224_utf8_bug.xml b/Lib/test/xmltestdata/expat224_utf8_bug.xml
@@ -0,0 +1,2 @@
+<a b='01234567890123456古人咏雪抽幽思骋妍辞竞险韵偶得一编奇绝辄擅美当时流声后代是以北门之风南山之雅梁园之简黄台之赋至今为作家称述尚矣及至洛阳之卧剡溪之兴灞桥之思亦皆传为故事钱塘沈履德先生隐居西湖两峰间孤高贞洁与雪同调方大雪满天皴肤粟背之际先生乃鹿中豹舄端居闭门或扶童曳杖踏遍六桥三竺时取古人诗讽咏之合唐宋元诸名家集句成诗得二百四十章联络通穿如出一人如呵一气气立于言表格备于篇中略无掇拾补凑之形非胸次包罗壮阔笔底驱走鲍谢欧苏诸公不能为此世称王荆公为集句擅长观其在钟山对雪仅题数篇未见有此噫嘻奇矣哉亦富矣哉予慕先生有袁安之节愧不能为慧可之立乃取新集命工传写使海内同好者知先生为博古传述之士而一新世人之耳目他日必有慕潜德阐幽光而剞劂以传者余实为之执殳矣
+弘治戊午仲冬望日慈溪杨子器衵于海虞官舍序毕诗部' />
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		<a b='01234567890123456古人咏雪抽幽思骋妍辞竞险韵偶得一编奇绝辄擅美当时流声后代是以北门之风南山之雅梁园之简黄台之赋至今为作家称述尚矣及至洛阳之卧剡溪之兴灞桥之思亦皆传为故事钱塘沈履德先生隐居西湖两峰间孤高贞洁与雪同调方大雪满天皴肤粟背之际先生乃鹿中豹舄端居闭门或扶童曳杖踏遍六桥三竺时取古人诗讽咏之合唐宋元诸名家集句成诗得二百四十章联络通穿如出一人如呵一气气立于言表格备于篇中略无掇拾补凑之形非胸次包罗壮阔笔底驱走鲍谢欧苏诸公不能为此世称王荆公为集句擅长观其在钟山对雪仅题数篇未见有此噫嘻奇矣哉亦富矣哉予慕先生有袁安之节愧不能为慧可之立乃取新集命工传写使海内同好者知先生为博古传述之士而一新世人之耳目他日必有慕潜德阐幽光而剞劂以传者余实为之执殳矣
Copy link Copy Markdown Member serhiy-storchaka Sep 14, 2017 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I don't know Chinese. I used another example to reproduce this bug: https://bugs.python.org/issue31303#msg300997. Or simpler: `xml.etree.ElementTree.XML(b'<a b="' + b'x' * 1023 + b'\xc3\xa0"/>')` Copy link Copy Markdown Member Author vstinner* Sep 15, 2017 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. `xml.etree.ElementTree.XML(b'<a b="' + b'x' * 1023 + b'\xc3\xa0"/>')` Ok, I also added this test. I prefer 2 tests rather than a single one :-) I like to reuse the same XML data which was used to reproduce the initial Expat bug. Copy link Copy Markdown Member serhiy-storchaka Sep 15, 2017 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I proposed a different test not because I don't know Chinese, but because it is simpler, doesn't need an external file, and is more easily extensible. For example you can test with two strings, `b'<a b="' + b'\xc3\xa0' * 1024 + b'"/>'` and `b'<a b="x' + b'\xc3\xa0' * 1024 + b'"/>'`, and be sure that it covers any buffer boundaries in the range of two kilobytes. The original Chinese example works only for a specific buffer size (1 KiB) and a specific buffering strategy (attribute value is written from the start of the buffer). Copy link Copy Markdown Member Author vstinner Sep 15, 2017 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. For example you can test with two strings : (...) Ok, I added these two examples as well. FYI I tested and `b'<a b="' + b'\xc3\xa0' * 1024 + b'"/>'` (2 KB) worked well on Expat <= 2.2.3, but `b'<a b="x' + b'\xc3\xa0' * 1024 + b'"/>'` failed (2 KB + 1). I added 4 tests. It should now be enough to test for non-regression, no? 
If you want more tests, IMHO they should be written in Expat, not CPython ;-) I only wanted to make sure that the fix was applied, not try all corner cases.
		弘治戊午仲冬望日慈溪杨子器衵于海虞官舍序毕诗部' />