summary | refs | log | tree | commit | diff
path: root/bs4/builder/_lxml.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- bs4/builder/_lxml.py | 92
1 files changed, 63 insertions, 29 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index be35d70..601b793 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -13,9 +13,10 @@ from bs4.builder import (
HTML,
HTMLTreeBuilder,
PERMISSIVE,
+ ParserRejectedMarkup,
TreeBuilder,
XML)
-from bs4.dammit import UnicodeDammit
+from bs4.dammit import EncodingDetector
LXML = 'lxml'
@@ -33,22 +34,30 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
- @property
- def default_parser(self):
+ def default_parser(self, encoding):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
- return etree.XMLParser(target=self, strip_cdata=False, recover=True)
+ if self._default_parser is not None:
+ return self._default_parser
+ return etree.XMLParser(
+ target=self, strip_cdata=False, recover=True, encoding=encoding)
+
+ def parser_for(self, encoding):
+ # Use the default parser.
+ parser = self.default_parser(encoding)
+
+ if isinstance(parser, collections.Callable):
+ # Instantiate the parser with default arguments
+ parser = parser(target=self, strip_cdata=False, encoding=encoding)
+ return parser
def __init__(self, parser=None, empty_element_tags=None):
+ # TODO: Issue a warning if parser is present but not a
+ # callable, since that means there's no way to create new
+ # parsers for different encodings.
+ self._default_parser = parser
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
- if parser is None:
- # Use the default parser.
- parser = self.default_parser
- if isinstance(parser, collections.Callable):
- # Instantiate the parser with default arguments
- parser = parser(target=self, strip_cdata=False)
- self.parser = parser
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS]
@@ -63,33 +72,53 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
- :return: A 3-tuple (markup, original encoding, encoding
- declared within markup).
+ :yield: A series of 4-tuples.
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for parsing the document.
"""
if isinstance(markup, unicode):
- return markup, None, None, False
+ # We were given Unicode. Maybe lxml can parse Unicode on
+ # this system?
+ yield markup, None, document_declared_encoding, False
+ if isinstance(markup, unicode):
+ # No, apparently not. Convert the Unicode to UTF-8 and
+ # tell lxml to parse it as UTF-8.
+ yield (markup.encode("utf8"), "utf8",
+ document_declared_encoding, False)
+
+ # Instead of using UnicodeDammit to convert the bytestring to
+ # Unicode using different encodings, use EncodingDetector to
+ # iterate over the encodings, and tell lxml to try to parse
+ # the document as each one in turn.
+ is_html = not self.is_xml
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
+ detector = EncodingDetector(markup, try_encodings, is_html)
+ for encoding in detector.encodings:
+ yield (markup, encoding, document_declared_encoding, False)
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
markup = StringIO(markup)
+
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
- self.parser.feed(data)
- while data != '':
- # Now call feed() on the rest of the data, chunk by chunk.
- data = markup.read(self.CHUNK_SIZE)
- if data != '':
- self.parser.feed(data)
- self.parser.close()
+ try:
+ self.parser = self.parser_for(self.soup.original_encoding)
+ self.parser.feed(data)
+ while len(data) != 0:
+ # Now call feed() on the rest of the data, chunk by chunk.
+ data = markup.read(self.CHUNK_SIZE)
+ if len(data) != 0:
+ self.parser.feed(data)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError), e:
+ raise ParserRejectedMarkup(str(e))
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS]
@@ -186,13 +215,18 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
- @property
- def default_parser(self):
+ def default_parser(self, encoding):
return etree.HTMLParser
def feed(self, markup):
- self.parser.feed(markup)
- self.parser.close()
+ encoding = self.soup.original_encoding
+ try:
+ self.parser = self.parser_for(encoding)
+ self.parser.feed(markup)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError), e:
+ raise ParserRejectedMarkup(str(e))
+
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""