From 8a511751f6a4b0c6f789695d000bf1f6074ce15d Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Sat, 24 Mar 2012 11:00:42 -0400
Subject: Pass data into XMLParser.feed() in chunks. [bug=963880]

---
 NEWS.txt             |  3 +++
 bs4/builder/_lxml.py | 14 +++++++++++++-
 bs4/testing.py       |  8 ++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/NEWS.txt b/NEWS.txt
index 097ee43..c423803 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
 = 4.0.2 () =
 
+* Worked around a possible bug in lxml that prevents non-tiny XML
+  documents from being parsed. [bug=963880, bug=963936]
+
 * Fixed a bug where specifying `text` while searching for a tag only
   worked if `text` specified an exact string match. [bug=955942]
 
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 39ad396..d97b8d1 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -3,6 +3,7 @@ __all__ = [
     'LXMLTreeBuilder',
     ]
 
+from StringIO import StringIO
 import collections
 from lxml import etree
 from bs4.element import Comment, Doctype, NamespacedAttribute
@@ -25,6 +26,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     # Well, it's permissive by XML parser standards.
     features = [LXML, XML, FAST, PERMISSIVE]
 
+    CHUNK_SIZE = 512
+
     @property
     def default_parser(self):
         # This can either return a parser object or a class, which
@@ -68,7 +71,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                 dammit.contains_replacement_characters)
 
     def feed(self, markup):
-        self.parser.feed(markup)
+        """Feed markup to the parser in CHUNK_SIZE pieces, then close it."""
+        if isinstance(markup, basestring):
+            markup = StringIO(markup)
+        # Always feed once (an unfed parser is uninitialized); stop at EOF.
+        data = markup.read(self.CHUNK_SIZE)
+        while True:
+            self.parser.feed(data)
+            data = markup.read(self.CHUNK_SIZE)
+            if not data:
+                break
         self.parser.close()
 
     def close(self):
diff --git a/bs4/testing.py b/bs4/testing.py
index 1b73160..e9c505c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -410,6 +410,14 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(soup.encode("utf-8"), markup)
 
+    def test_large_xml_document(self):
+        """Parsing then re-encoding a large XML document is lossless."""
+        prefix = b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+        filler = b'0' * (2**12)
+        markup = prefix + filler + b'</root>'
+        # The serialized soup must match the input byte for byte.
+        self.assertEqual(self.soup(markup).encode("utf-8"), markup)
+
 
     def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
         self.assertSoupEquals("<p>", "<p/>")
-- 
cgit v1.2.3