diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 14 | ||||
-rw-r--r-- | bs4/testing.py | 8 |
3 files changed, 24 insertions, 1 deletions
@@ -1,5 +1,8 @@
 = 4.0.2 () =
 
+* Worked around a possible bug in lxml that prevents non-tiny XML
+  documents from being parsed. [bug=963880, bug=963936]
+
 * Fixed a bug where specifying `text` while searching for a tag only
   worked if `text` specified an exact string match. [bug=955942]
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 39ad396..d97b8d1 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -3,6 +3,7 @@ __all__ = [
     'LXMLTreeBuilder',
     ]
 
+from StringIO import StringIO
 import collections
 from lxml import etree
 from bs4.element import Comment, Doctype, NamespacedAttribute
@@ -25,6 +26,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     # Well, it's permissive by XML parser standards.
     features = [LXML, XML, FAST, PERMISSIVE]
 
+    CHUNK_SIZE = 512
+
     @property
     def default_parser(self):
         # This can either return a parser object or a class, which
@@ -68,7 +71,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                 dammit.contains_replacement_characters)
 
     def feed(self, markup):
-        self.parser.feed(markup)
+        if isinstance(markup, basestring):
+            markup = StringIO(markup)
+        # Call feed() at least once, even if the markup is empty,
+        # or the parser won't be initialized.
+        data = markup.read(self.CHUNK_SIZE)
+        self.parser.feed(data)
+        while data != '':
+            # Now call feed() on the rest of the data, chunk by chunk.
+            data = markup.read(self.CHUNK_SIZE)
+            self.parser.feed(data)
         self.parser.close()
 
     def close(self):
diff --git a/bs4/testing.py b/bs4/testing.py
index 1b73160..e9c505c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -410,6 +410,14 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(soup.encode("utf-8"), markup)
 
+    def test_large_xml_document(self):
+        """A large XML document should come out the same as it went in."""
+        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+                  + b'0' * (2**12)
+                  + b'</root>')
+        soup = self.soup(markup)
+        self.assertEqual(soup.encode("utf-8"), markup)
+
     def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
         self.assertSoupEquals("<p>", "<p/>")
 