From 05a5edbd5d62a9d202a0fb956988dd19185dbeb4 Mon Sep 17 00:00:00 2001 From: Zackery Spytz Date: Wed, 17 Jun 2020 23:50:19 -0600 Subject: [PATCH 1/4] bpo-35018: Sax parser provides no user access to lexical handlers Co-Authored-By: Jonathan Gossage --- Doc/library/xml.sax.handler.rst | 61 ++++++- Lib/test/test_sax.py | 157 +++++++++++++++++- Lib/xml/sax/handler.py | 44 +++++ .../2020-06-17-23-49-45.bpo-35018.NP5_Qk.rst | 2 + 4 files changed, 255 insertions(+), 9 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2020-06-17-23-49-45.bpo-35018.NP5_Qk.rst diff --git a/Doc/library/xml.sax.handler.rst b/Doc/library/xml.sax.handler.rst index ae0877ca90db07b..97ba93f4789aa46 100644 --- a/Doc/library/xml.sax.handler.rst +++ b/Doc/library/xml.sax.handler.rst @@ -11,12 +11,12 @@ -------------- -The SAX API defines four kinds of handlers: content handlers, DTD handlers, -error handlers, and entity resolvers. Applications normally only need to -implement those interfaces whose events they are interested in; they can -implement the interfaces in a single object or in multiple objects. Handler -implementations should inherit from the base classes provided in the module -:mod:`xml.sax.handler`, so that all methods get default implementations. +The SAX API defines five kinds of handlers: content handlers, DTD handlers, +error handlers, entity resolvers and lexical handlers. Applications normally +only need to implement those interfaces whose events they are interested in; +they can implement the interfaces in a single object or in multiple objects. +Handler implementations should inherit from the base classes provided in the +module :mod:`xml.sax.handler`, so that all methods get default implementations. .. class:: ContentHandler @@ -47,6 +47,12 @@ implementations should inherit from the base classes provided in the module application. The methods of this object control whether errors are immediately converted to exceptions or are handled in some other way. + +.. 
class:: LexicalHandler + + Interface used by the parser to represent low frequency events which may not + be of interest to many applications. + In addition to these classes, :mod:`xml.sax.handler` provides symbolic constants for the feature and property names. @@ -114,7 +120,7 @@ for the feature and property names. .. data:: property_lexical_handler | value: ``"http://xml.org/sax/properties/lexical-handler"`` - | data type: xml.sax.sax2lib.LexicalHandler (not supported in Python 2) + | data type: xml.sax.handler.LexicalHandler (not supported in Python 2) | description: An optional extension handler for lexical events like comments. | access: read/write @@ -413,3 +419,44 @@ the passed-in exception object. information will continue to be passed to the application. Raising an exception in this method will cause parsing to end. + +.. _lexical-handler-objects: + +LexicalHandler Objects +---------------------- +Optional SAX2 handler for lexical events. + +This handler is used to obtain lexical information about an XML +document. Lexical information includes information describing the +document encoding used and XML comments embedded in the document, as +well as section boundaries for the DTD and for any CDATA sections. +The lexical handlers are used in the same manner as content handlers. + +Set the LexicalHandler of an XMLReader by using the setProperty method +with the property identifier. + + +.. method:: LexicalHandler.comment(content) + + Reports a comment anywhere in the document (including the DTD and + outside the document element). + +.. method:: LexicalHandler.startDTD(name, public_id, system_id) + + Reports the start of the DTD declarations if the document has an + associated DTD. + +.. method:: LexicalHandler.endDTD() + + Reports the end of DTD declaration. + +.. method:: LexicalHandler.startCDATA() + + Reports the start of a CDATA marked section. + + The contents of the CDATA marked section will be reported through + the characters handler. + +.. 
method:: LexicalHandler.endCDATA() + + Reports the end of a CDATA marked section. diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py index bc77103641b6ffe..3dab58da5a7e2a7 100644 --- a/Lib/test/test_sax.py +++ b/Lib/test/test_sax.py @@ -13,7 +13,8 @@ from xml.sax.saxutils import XMLGenerator, escape, unescape, quoteattr, \ XMLFilterBase, prepare_input_source from xml.sax.expatreader import create_parser -from xml.sax.handler import feature_namespaces, feature_external_ges +from xml.sax.handler import (feature_namespaces, feature_external_ges, + LexicalHandler) from xml.sax.xmlreader import InputSource, AttributesImpl, AttributesNSImpl from io import BytesIO, StringIO import codecs @@ -1354,6 +1355,155 @@ def test_nsattrs_wattr(self): self.assertEqual(attrs.getQNameByName((ns_uri, "attr")), "ns:attr") +class LexicalHandlerTest(unittest.TestCase): + def setUp(self): + self.parser = None + + self.specified_version = '1.0' + self.specified_encoding = 'UTF-8' + self.specified_doctype = 'wish' + self.specified_entity_names = ('nbsp', 'source', 'target') + self.specified_comment = ('Comment in a DTD', + 'Really! You think so?') + self.test_data = StringIO() + self.test_data.write('\n'. + format(self.specified_version, + self.specified_encoding)) + self.test_data.write('\n'. + format(self.specified_comment[0])) + self.test_data.write('\n'. + format(self.specified_doctype)) + self.test_data.write('\n') + self.test_data.write('\n') + self.test_data.write('\n') + self.test_data.write('\n') + self.test_data.write('\n') + self.test_data.write('\n'. + format(self.specified_entity_names[0])) + self.test_data.write('\n'. + format(self.specified_entity_names[1])) + self.test_data.write('\n'. 
+ format(self.specified_entity_names[2])) + self.test_data.write(']>\n') + self.test_data.write('<{}>'.format(self.specified_doctype)) + self.test_data.write('Aristotle\n') + self.test_data.write('Alexander\n') + self.test_data.write('Supplication\n') + self.test_data.write('Teach me patience!\n') + self.test_data.write('
&{};&{};&{};
\n'. + format(self.specified_entity_names[1], + self.specified_entity_names[0], + self.specified_entity_names[2])) + self.test_data.write('\n'.format(self.specified_comment[1])) + self.test_data.write('\n'.format(self.specified_doctype)) + self.test_data.seek(0) + + # Data received from handlers - to be validated + self.version = None + self.encoding = None + self.standalone = None + self.doctype = None + self.publicID = None + self.systemID = None + self.end_of_dtd = False + self.comments = [] + + def test_handlers(self): + class TestLexicalHandler(LexicalHandler): + def __init__(self, test_harness, *args, **kwargs): + super().__init__(*args, **kwargs) + self.test_harness = test_harness + + def startDTD(self, doctype, publicID, systemID): + self.test_harness.doctype = doctype + self.test_harness.publicID = publicID + self.test_harness.systemID = systemID + + def endDTD(self): + self.test_harness.end_of_dtd = True + + def comment(self, text): + self.test_harness.comments.append(text) + + self.parser = create_parser() + self.parser.setContentHandler(ContentHandler()) + self.parser.setProperty( + 'http://xml.org/sax/properties/lexical-handler', + TestLexicalHandler(self)) + source = InputSource() + source.setCharacterStream(self.test_data) + self.parser.parse(source) + self.assertEqual(self.doctype, self.specified_doctype) + self.assertIsNone(self.publicID) + self.assertIsNone(self.systemID) + self.assertTrue(self.end_of_dtd) + self.assertEqual(len(self.comments), + len(self.specified_comment)) + self.assertEqual(f' {self.specified_comment[0]} ', self.comments[0]) + + +class CDATAHandlerTest(unittest.TestCase): + def setUp(self): + self.parser = None + self.specified_chars = [] + self.specified_chars.append(('Parseable character data', False)) + self.specified_chars.append(('<> &% - assorted other XML junk.', True)) + self.char_index = 0 # Used to index specified results within handlers + self.test_data = StringIO() + self.test_data.write('\n') + 
self.test_data.write('\n') + self.test_data.write(f'{self.specified_chars[0][0]}\n') + self.test_data.write('\n') + self.test_data.write('\n') + self.test_data.write(f'\n') + self.test_data.write('\n') + self.test_data.write('\n') + self.test_data.seek(0) + + # Data received from handlers - to be validated + self.chardata = [] + self.in_cdata = False + + def test_handlers(self): + class TestLexicalHandler(LexicalHandler): + def __init__(self, test_harness, *args, **kwargs): + super().__init__(*args, **kwargs) + self.test_harness = test_harness + + def startCDATA(self): + self.test_harness.in_cdata = True + + def endCDATA(self): + self.test_harness.in_cdata = False + + class TestCharHandler(ContentHandler): + def __init__(self, test_harness, *args, **kwargs): + super().__init__(*args, **kwargs) + self.test_harness = test_harness + + def characters(self, content): + if content != '\n': + h = self.test_harness + t = h.specified_chars[h.char_index] + h.assertEqual(t[0], content) + h.assertEqual(t[1], h.in_cdata) + h.char_index += 1 + + self.parser = create_parser() + self.parser.setContentHandler(TestCharHandler(self)) + self.parser.setProperty( + 'http://xml.org/sax/properties/lexical-handler', + TestLexicalHandler(self)) + source = InputSource() + source.setCharacterStream(self.test_data) + self.parser.parse(source) + + self.assertFalse(self.in_cdata) + self.assertEqual(self.char_index, 2) + + def test_main(): run_unittest(MakeParserTest, ParseTest, @@ -1366,7 +1516,10 @@ def test_main(): StreamReaderWriterXmlgenTest, ExpatReaderTest, ErrorReportingTest, - XmlReaderTest) + XmlReaderTest, + LexicalHandlerTest, + CDATAHandlerTest) + if __name__ == "__main__": test_main() diff --git a/Lib/xml/sax/handler.py b/Lib/xml/sax/handler.py index 481733d2cbe6e5a..753aff80b64450b 100644 --- a/Lib/xml/sax/handler.py +++ b/Lib/xml/sax/handler.py @@ -340,3 +340,47 @@ def resolveEntity(self, publicId, systemId): property_xml_string, property_encoding, property_interning_dict] + + 
+class LexicalHandler: + """Optional SAX2 handler for lexical events. + + This handler is used to obtain lexical information about an XML + document, that is, information about how the document was encoded + (as opposed to what it contains, which is reported to the + ContentHandler), such as comments and CDATA marked section + boundaries. + + To set the LexicalHandler of an XMLReader, use the setProperty + method with the property identifier.""" + + def comment(self, content): + """Reports a comment anywhere in the document (including the + DTD and outside the document element). + + content is a string that holds the contents of the comment.""" + + def startDTD(self, name, public_id, system_id): + """Report the start of the DTD declarations, if the document + has an associated DTD. + + A startEntity event will be reported before declaration events + from the external DTD subset are reported, and this can be + used to infer from which subset DTD declarations derive. + + name is the name of the document element type, public_id the + public identifier of the DTD (or None if none were supplied) + and system_id the system identifier of the external subset (or + None if none were supplied).""" + + def endDTD(self): + """Signals the end of DTD declarations.""" + + def startCDATA(self): + """Reports the beginning of a CDATA marked section. + + The contents of the CDATA marked section will be reported + through the characters event.""" + + def endCDATA(self): + """Reports the end of a CDATA marked section.""" diff --git a/Misc/NEWS.d/next/Library/2020-06-17-23-49-45.bpo-35018.NP5_Qk.rst b/Misc/NEWS.d/next/Library/2020-06-17-23-49-45.bpo-35018.NP5_Qk.rst new file mode 100644 index 000000000000000..f764323ae631cfc --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-06-17-23-49-45.bpo-35018.NP5_Qk.rst @@ -0,0 +1,2 @@ +Add the :class:`xml.sax.handler.LexicalHandler` class that is present in +other SAX XML implementations. 
From 960ad4752f31c9ff58fa3488d20e286c6cfd164a Mon Sep 17 00:00:00 2001 From: Zackery Spytz Date: Sat, 20 Jun 2020 21:22:46 -0600 Subject: [PATCH 2/4] Update Doc/whatsnew/3.10.rst --- Doc/whatsnew/3.10.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index 89958450200f93a..8446f03b9cd710a 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -110,6 +110,13 @@ Added the *root_dir* and *dir_fd* parameters in :func:`~glob.glob` and :func:`~glob.iglob` which allow to specify the root directory for searching. (Contributed by Serhiy Storchaka in :issue:`38144`.) +xml +--- + +Add a :class:`~xml.sax.handler.LexicalHandler` class to the +:mod:`xml.sax.handler` module. +(Contributed by Jonathan Gossage and Zackery Spytz in :issue:`35018`.) + Optimizations ============= From 8c3ba60efffc3464db49bedc0c0aa6d8a7d4b7b0 Mon Sep 17 00:00:00 2001 From: Zackery Spytz Date: Sat, 20 Jun 2020 21:35:27 -0600 Subject: [PATCH 3/4] Make the suggested changes. --- Doc/library/xml.sax.handler.rst | 3 ++- Lib/xml/sax/handler.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Doc/library/xml.sax.handler.rst b/Doc/library/xml.sax.handler.rst index 97ba93f4789aa46..e3d49ba4c102e7f 100644 --- a/Doc/library/xml.sax.handler.rst +++ b/Doc/library/xml.sax.handler.rst @@ -433,7 +433,8 @@ well as section boundaries for the DTD and for any CDATA sections. The lexical handlers are used in the same manner as content handlers. Set the LexicalHandler of an XMLReader by using the setProperty method -with the property identifier. +with the property identifier +``'http://xml.org/sax/handlers/LexicalHandler'``. .. method:: LexicalHandler.comment(content) diff --git a/Lib/xml/sax/handler.py b/Lib/xml/sax/handler.py index 753aff80b64450b..49526975f67905a 100644 --- a/Lib/xml/sax/handler.py +++ b/Lib/xml/sax/handler.py @@ -352,7 +352,8 @@ class LexicalHandler: boundaries. 
To set the LexicalHandler of an XMLReader, use the setProperty - method with the property identifier.""" + method with the property identifier + 'http://xml.org/sax/handlers/LexicalHandler'.""" def comment(self, content): """Reports a comment anywhere in the document (including the From 9a69e80366f70d2b63daa16a8dd0a74597bd7234 Mon Sep 17 00:00:00 2001 From: scoder Date: Sun, 21 Jun 2020 08:57:57 +0200 Subject: [PATCH 4/4] Fix property ID string according to spec --- Doc/library/xml.sax.handler.rst | 2 +- Lib/xml/sax/handler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/xml.sax.handler.rst b/Doc/library/xml.sax.handler.rst index e3d49ba4c102e7f..3746a58c9b9558a 100644 --- a/Doc/library/xml.sax.handler.rst +++ b/Doc/library/xml.sax.handler.rst @@ -434,7 +434,7 @@ The lexical handlers are used in the same manner as content handlers. Set the LexicalHandler of an XMLReader by using the setProperty method with the property identifier -``'http://xml.org/sax/handlers/LexicalHandler'``. +``'http://xml.org/sax/properties/lexical-handler'``. .. method:: LexicalHandler.comment(content) diff --git a/Lib/xml/sax/handler.py b/Lib/xml/sax/handler.py index 49526975f67905a..e8d417e51942329 100644 --- a/Lib/xml/sax/handler.py +++ b/Lib/xml/sax/handler.py @@ -353,7 +353,7 @@ class LexicalHandler: To set the LexicalHandler of an XMLReader, use the setProperty method with the property identifier - 'http://xml.org/sax/handlers/LexicalHandler'.""" + 'http://xml.org/sax/properties/lexical-handler'.""" def comment(self, content): """Reports a comment anywhere in the document (including the