summary | refs | log | tree | commit | diff
path: root/bs4/builder/_lxml.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- bs4/builder/_lxml.py 14
1 files changed, 13 insertions, 1 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 39ad396..d97b8d1 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -3,6 +3,7 @@ __all__ = [
'LXMLTreeBuilder',
]
+from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
@@ -25,6 +26,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
+ CHUNK_SIZE = 512
+
@property
def default_parser(self):
# This can either return a parser object or a class, which
@@ -68,7 +71,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
dammit.contains_replacement_characters)
def feed(self, markup):
- self.parser.feed(markup)
+ if isinstance(markup, basestring):
+ markup = StringIO(markup)
+ # Call feed() at least once, even if the markup is empty,
+ # or the parser won't be initialized.
+ data = markup.read(self.CHUNK_SIZE)
+ self.parser.feed(data)
+ while data != '':
+ # Now call feed() on the rest of the data, chunk by chunk.
+ data = markup.read(self.CHUNK_SIZE)
+ self.parser.feed(data)
self.parser.close()
def close(self):