summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- bs4/__init__.py 22
-rw-r--r-- bs4/builder/__init__.py 3
-rw-r--r-- bs4/builder/_html5lib.py 2
-rw-r--r-- bs4/builder/_htmlparser.py 9
-rw-r--r-- bs4/builder/_lxml.py 92
-rw-r--r-- bs4/dammit.py 21
-rw-r--r-- bs4/testing.py 13
-rw-r--r-- bs4/tests/test_lxml.py 6
8 files changed, 115 insertions, 53 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index a949d6d..956f26e 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -26,7 +26,7 @@ __all__ = ['BeautifulSoup']
import re
import warnings
-from .builder import builder_registry
+from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
CData,
@@ -160,18 +160,17 @@ class BeautifulSoup(Tag):
self.parse_only = parse_only
- self.reset()
-
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
- (self.markup, self.original_encoding, self.declared_html_encoding,
- self.contains_replacement_characters) = (
- self.builder.prepare_markup(markup, from_encoding))
-
- try:
- self._feed()
- except StopParsing:
- pass
+ for (self.markup, self.original_encoding, self.declared_html_encoding,
+ self.contains_replacement_characters) in (
+ self.builder.prepare_markup(markup, from_encoding)):
+ self.reset()
+ try:
+ self._feed()
+ break
+ except ParserRejectedMarkup, e:
+ pass
# Clear out the markup and remove the builder's circular
# reference to this object.
@@ -353,7 +352,6 @@ class BeautifulStoneSoup(BeautifulSoup):
class StopParsing(Exception):
pass
-
class FeatureNotFound(ValueError):
pass
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index bae453e..e59dae2 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -296,6 +296,9 @@ def register_treebuilders_from(module):
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
+class ParserRejectedMarkup(Exception):
+ pass
+
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index e439ac8..3bbc9a9 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -27,7 +27,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
- return markup, None, None, False
+ yield (markup, None, None, False)
# These methods are defined by Beautiful Soup.
def feed(self, markup):
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index e34c9fa..2b98969 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -133,13 +133,14 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
- return markup, None, None, False
+ yield (markup, None, None, False)
+ return
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
+ yield (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index be35d70..601b793 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -13,9 +13,10 @@ from bs4.builder import (
HTML,
HTMLTreeBuilder,
PERMISSIVE,
+ ParserRejectedMarkup,
TreeBuilder,
XML)
-from bs4.dammit import UnicodeDammit
+from bs4.dammit import EncodingDetector
LXML = 'lxml'
@@ -33,22 +34,30 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
- @property
- def default_parser(self):
+ def default_parser(self, encoding):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
- return etree.XMLParser(target=self, strip_cdata=False, recover=True)
+ if self._default_parser is not None:
+ return self._default_parser
+ return etree.XMLParser(
+ target=self, strip_cdata=False, recover=True, encoding=encoding)
+
+ def parser_for(self, encoding):
+ # Use the default parser.
+ parser = self.default_parser(encoding)
+
+ if isinstance(parser, collections.Callable):
+ # Instantiate the parser with default arguments
+ parser = parser(target=self, strip_cdata=False, encoding=encoding)
+ return parser
def __init__(self, parser=None, empty_element_tags=None):
+ # TODO: Issue a warning if parser is present but not a
+ # callable, since that means there's no way to create new
+ # parsers for different encodings.
+ self._default_parser = parser
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
- if parser is None:
- # Use the default parser.
- parser = self.default_parser
- if isinstance(parser, collections.Callable):
- # Instantiate the parser with default arguments
- parser = parser(target=self, strip_cdata=False)
- self.parser = parser
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS]
@@ -63,33 +72,53 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
- :return: A 3-tuple (markup, original encoding, encoding
- declared within markup).
+ :yield: A series of 4-tuples.
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for parsing the document.
"""
if isinstance(markup, unicode):
- return markup, None, None, False
+ # We were given Unicode. Maybe lxml can parse Unicode on
+ # this system?
+ yield markup, None, document_declared_encoding, False
+ if isinstance(markup, unicode):
+ # No, apparently not. Convert the Unicode to UTF-8 and
+ # tell lxml to parse it as UTF-8.
+ yield (markup.encode("utf8"), "utf8",
+ document_declared_encoding, False)
+
+ # Instead of using UnicodeDammit to convert the bytestring to
+ # Unicode using different encodings, use EncodingDetector to
+ # iterate over the encodings, and tell lxml to try to parse
+ # the document as each one in turn.
+ is_html = not self.is_xml
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
+ detector = EncodingDetector(markup, try_encodings, is_html)
+ for encoding in detector.encodings:
+ yield (markup, encoding, document_declared_encoding, False)
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
markup = StringIO(markup)
+
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
- self.parser.feed(data)
- while data != '':
- # Now call feed() on the rest of the data, chunk by chunk.
- data = markup.read(self.CHUNK_SIZE)
- if data != '':
- self.parser.feed(data)
- self.parser.close()
+ try:
+ self.parser = self.parser_for(self.soup.original_encoding)
+ self.parser.feed(data)
+ while len(data) != 0:
+ # Now call feed() on the rest of the data, chunk by chunk.
+ data = markup.read(self.CHUNK_SIZE)
+ if len(data) != 0:
+ self.parser.feed(data)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError), e:
+ raise ParserRejectedMarkup(str(e))
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS]
@@ -186,13 +215,18 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
- @property
- def default_parser(self):
+ def default_parser(self, encoding):
return etree.HTMLParser
def feed(self, markup):
- self.parser.feed(markup)
- self.parser.close()
+ encoding = self.soup.original_encoding
+ try:
+ self.parser = self.parser_for(encoding)
+ self.parser.feed(markup)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError), e:
+ raise ParserRejectedMarkup(str(e))
+
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
diff --git a/bs4/dammit.py b/bs4/dammit.py
index cb6d354..a8acef9 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -224,9 +224,11 @@ class EncodingDetector:
self.sniffed_encoding = None
def _usable(self, encoding, tried):
- if encoding not in tried and encoding is not None:
- tried.add(encoding)
- return True
+ if encoding is not None:
+ encoding = encoding.lower()
+ if encoding not in tried:
+ tried.add(encoding)
+ return True
return False
@property
@@ -386,18 +388,17 @@ class UnicodeDammit:
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
- self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
+ self.detector = EncodingDetector(markup, override_encodings, is_html)
if markup == '' or isinstance(markup, unicode):
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
- self.detector = EncodingDetector(markup, override_encodings, is_html)
self.markup, ignore = self.detector.strip_byte_order_mark(markup)
u = None
@@ -496,6 +497,16 @@ class UnicodeDammit:
newdata = unicode(data, encoding, errors)
return newdata
+ @property
+ def declared_html_encoding(self):
+ if not self.is_html:
+ return None
+ return self.detector.declared_encoding
+
+ @property
+ def is_html(self):
+ return self.detector.is_html
+
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
diff --git a/bs4/testing.py b/bs4/testing.py
index d8ff6b7..c363a89 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -279,6 +279,14 @@ class HTMLTreeBuilderSmokeTest(object):
# to detect any differences between them.
#
+ def test_can_parse_unicode_document(self):
+ # A seemingly innocuous document... but it's in Unicode! And
+ # it contains characters that can't be represented in the
+ # encoding found in the declaration! The horror!
+ markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+ soup = self.soup(markup)
+ self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
strainer = SoupStrainer("b")
@@ -482,6 +490,11 @@ class XMLTreeBuilderSmokeTest(object):
encoded = soup.encode()
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+ def test_can_parse_unicode_document(self):
+ markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+ soup = self.soup(markup)
+ self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 80458de..27cb2d9 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -4,14 +4,16 @@ import re
import warnings
try:
- from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
- LXML_PRESENT = True
import lxml.etree
+ LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e:
LXML_PRESENT = False
LXML_VERSION = (0,)
+if LXML_PRESENT:
+ from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
+
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,