diff options
-rw-r--r-- | bs4/__init__.py | 22 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 3 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 2 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 9 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 92 | ||||
-rw-r--r-- | bs4/dammit.py | 21 | ||||
-rw-r--r-- | bs4/testing.py | 13 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 6 |
8 files changed, 115 insertions, 53 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index a949d6d..956f26e 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -26,7 +26,7 @@ __all__ = ['BeautifulSoup'] import re import warnings -from .builder import builder_registry +from .builder import builder_registry, ParserRejectedMarkup from .dammit import UnicodeDammit from .element import ( CData, @@ -160,18 +160,17 @@ class BeautifulSoup(Tag): self.parse_only = parse_only - self.reset() - if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - (self.markup, self.original_encoding, self.declared_html_encoding, - self.contains_replacement_characters) = ( - self.builder.prepare_markup(markup, from_encoding)) - - try: - self._feed() - except StopParsing: - pass + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup(markup, from_encoding)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup, e: + pass # Clear out the markup and remove the builder's circular # reference to this object. @@ -353,7 +352,6 @@ class BeautifulStoneSoup(BeautifulSoup): class StopParsing(Exception): pass - class FeatureNotFound(ValueError): pass diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index bae453e..e59dae2 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -296,6 +296,9 @@ def register_treebuilders_from(module): # Register the builder while we're at it. this_module.builder_registry.register(obj) +class ParserRejectedMarkup(Exception): + pass + # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want lxml # to take precedence over html5lib, because it's faster. And we only diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index e439ac8..3bbc9a9 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -27,7 +27,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding - return markup, None, None, False + yield (markup, None, None, False) # These methods are defined by Beautiful Soup. def feed(self, markup): diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index e34c9fa..2b98969 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -133,13 +133,14 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): - return markup, None, None, False + yield (markup, None, None, False) + return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) def feed(self, markup): args, kwargs = self.parser_args diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index be35d70..601b793 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -13,9 +13,10 @@ from bs4.builder import ( HTML, HTMLTreeBuilder, PERMISSIVE, + ParserRejectedMarkup, TreeBuilder, XML) -from bs4.dammit import UnicodeDammit +from bs4.dammit import EncodingDetector LXML = 'lxml' @@ -33,22 +34,30 @@ class LXMLTreeBuilderForXML(TreeBuilder): # standard. DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} - @property - def default_parser(self): + def default_parser(self, encoding): # This can either return a parser object or a class, which # will be instantiated with default arguments. - return etree.XMLParser(target=self, strip_cdata=False, recover=True) + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser def __init__(self, parser=None, empty_element_tags=None): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. + self._default_parser = parser if empty_element_tags is not None: self.empty_element_tags = set(empty_element_tags) - if parser is None: - # Use the default parser. - parser = self.default_parser - if isinstance(parser, collections.Callable): - # Instantiate the parser with default arguments - parser = parser(target=self, strip_cdata=False) - self.parser = parser self.soup = None self.nsmaps = [self.DEFAULT_NSMAPS] @@ -63,33 +72,53 @@ class LXMLTreeBuilderForXML(TreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ - :return: A 3-tuple (markup, original encoding, encoding - declared within markup). + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. """ if isinstance(markup, unicode): - return markup, None, None, False + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + if isinstance(markup, unicode): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) + detector = EncodingDetector(markup, try_encodings, is_html) + for encoding in detector.encodings: + yield (markup, encoding, document_declared_encoding, False) def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) elif isinstance(markup, unicode): markup = StringIO(markup) + # Call feed() at least once, even if the markup is empty, # or the parser won't be initialized. data = markup.read(self.CHUNK_SIZE) - self.parser.feed(data) - while data != '': - # Now call feed() on the rest of the data, chunk by chunk. - data = markup.read(self.CHUNK_SIZE) - if data != '': - self.parser.feed(data) - self.parser.close() + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError), e: + raise ParserRejectedMarkup(str(e)) def close(self): self.nsmaps = [self.DEFAULT_NSMAPS] @@ -186,13 +215,18 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = [LXML, HTML, FAST, PERMISSIVE] is_xml = False - @property - def default_parser(self): + def default_parser(self, encoding): return etree.HTMLParser def feed(self, markup): - self.parser.feed(markup) - self.parser.close() + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError), e: + raise ParserRejectedMarkup(str(e)) + def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" diff --git a/bs4/dammit.py b/bs4/dammit.py index cb6d354..a8acef9 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -224,9 +224,11 @@ class EncodingDetector: self.sniffed_encoding = None def _usable(self, encoding, tried): - if encoding not in tried and encoding is not None: - tried.add(encoding) - return True + if encoding is not None: + encoding = encoding.lower() + if encoding not in tried: + tried.add(encoding) + return True return False @property @@ -386,18 +388,17 @@ class UnicodeDammit: def __init__(self, markup, override_encodings=[], smart_quotes_to=None, is_html=False): - self.declared_html_encoding = None self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.detector = EncodingDetector(markup, override_encodings, is_html) if markup == '' or isinstance(markup, unicode): self.markup = markup self.unicode_markup = unicode(markup) self.original_encoding = None return - self.detector = EncodingDetector(markup, override_encodings, is_html) self.markup, ignore = self.detector.strip_byte_order_mark(markup) u = None @@ -496,6 +497,16 @@ class UnicodeDammit: newdata = unicode(data, encoding, errors) return newdata + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding + + @property + def is_html(self): + return self.detector.is_html + def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and self._codec(charset.replace("-", ""))) \ diff --git a/bs4/testing.py b/bs4/testing.py index d8ff6b7..c363a89 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -279,6 +279,14 @@ class HTMLTreeBuilderSmokeTest(object): # to detect any differences between them. # + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! + markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' + soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) + def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") @@ -482,6 +490,11 @@ class XMLTreeBuilderSmokeTest(object): encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded) + def test_can_parse_unicode_document(self): + markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' + soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) + def test_popping_namespaced_tag(self): markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' soup = self.soup(markup) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 80458de..27cb2d9 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -4,14 +4,16 @@ import re import warnings try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - LXML_PRESENT = True import lxml.etree + LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION except ImportError, e: LXML_PRESENT = False LXML_VERSION = (0,) +if LXML_PRESENT: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + from bs4 import ( BeautifulSoup, BeautifulStoneSoup, |