From 8a511751f6a4b0c6f789695d000bf1f6074ce15d Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 24 Mar 2012 11:00:42 -0400 Subject: Pass data into XMLParser.feed() in chunks. [bug=963880] --- NEWS.txt | 3 +++ bs4/builder/_lxml.py | 14 +++++++++++++- bs4/testing.py | 8 ++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/NEWS.txt b/NEWS.txt index 097ee43..c423803 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,5 +1,8 @@ = 4.0.2 () = +* Worked around a possible bug in lxml that prevents non-tiny XML + documents from being parsed. [bug=963880, bug=963936] + * Fixed a bug where specifying `text` while searching for a tag only worked if `text` specified an exact string match. [bug=955942] diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 39ad396..d97b8d1 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -3,6 +3,7 @@ __all__ = [ 'LXMLTreeBuilder', ] +from StringIO import StringIO import collections from lxml import etree from bs4.element import Comment, Doctype, NamespacedAttribute @@ -25,6 +26,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Well, it's permissive by XML parser standards. features = [LXML, XML, FAST, PERMISSIVE] + CHUNK_SIZE = 512 + @property def default_parser(self): # This can either return a parser object or a class, which @@ -68,7 +71,16 @@ class LXMLTreeBuilderForXML(TreeBuilder): dammit.contains_replacement_characters) def feed(self, markup): - self.parser.feed(markup) + if isinstance(markup, basestring): + markup = StringIO(markup) + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + self.parser.feed(data) + while data != '': + # Now call feed() on the rest of the data, chunk by chunk. 
+ data = markup.read(self.CHUNK_SIZE) + self.parser.feed(data) self.parser.close() def close(self): diff --git a/bs4/testing.py b/bs4/testing.py index 1b73160..e9c505c 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -410,6 +410,14 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(soup.encode("utf-8"), markup) + def test_large_xml_document(self): + """A large XML document should come out the same as it went in.""" + markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>' + + b'0' * (2**12) + + b'</root>') + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): self.assertSoupEquals("<p>", "<p/>") -- cgit v1.2.3