summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt3
-rw-r--r--bs4/builder/_lxml.py14
-rw-r--r--bs4/testing.py8
3 files changed, 24 insertions, 1 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 097ee43..c423803 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
= 4.0.2 () =
+* Worked around a possible bug in lxml that prevents non-tiny XML
+ documents from being parsed. [bug=963880, bug=963936]
+
* Fixed a bug where specifying `text` while searching for a tag only
worked if `text` specified an exact string match. [bug=955942]
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 39ad396..d97b8d1 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -3,6 +3,7 @@ __all__ = [
'LXMLTreeBuilder',
]
+from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
@@ -25,6 +26,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
+ CHUNK_SIZE = 512
+
@property
def default_parser(self):
# This can either return a parser object or a class, which
@@ -68,7 +71,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
dammit.contains_replacement_characters)
def feed(self, markup):
- self.parser.feed(markup)
+ if isinstance(markup, basestring):
+ markup = StringIO(markup)
+ # Call feed() at least once, even if the markup is empty,
+ # or the parser won't be initialized.
+ data = markup.read(self.CHUNK_SIZE)
+ self.parser.feed(data)
+ while data != '':
+ # Now call feed() on the rest of the data, chunk by chunk.
+ data = markup.read(self.CHUNK_SIZE)
+ self.parser.feed(data)
self.parser.close()
def close(self):
diff --git a/bs4/testing.py b/bs4/testing.py
index 1b73160..e9c505c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -410,6 +410,14 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(soup.encode("utf-8"), markup)
+ def test_large_xml_document(self):
+ """A large XML document should come out the same as it went in."""
+ markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ + b'0' * (2**12)
+ + b'</root>')
+ soup = self.soup(markup)
+ self.assertEqual(soup.encode("utf-8"), markup)
+
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")