summaryrefslogtreecommitdiff
path: root/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r--bs4/builder/_htmlparser.py80
1 files changed, 46 insertions, 34 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ec6d456..64dfb27 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -38,35 +38,7 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
-class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
-
- is_xml = False
- features = [HTML, STRICT, HTMLPARSER]
-
- def __init__(self, *args, **kwargs):
- if CONSTRUCTOR_TAKES_STRICT:
- kwargs['strict'] = False
- return super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs)
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- """
- :return: A 4-tuple (markup, original encoding, encoding
- declared within markup, whether any characters had to be
- replaced with REPLACEMENT CHARACTER).
- """
- if isinstance(markup, unicode):
- return markup, None, None, False
-
- try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
-
- def feed(self, markup):
- super(HTMLParserTreeBuilder, self).feed(markup)
-
+class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
self.soup.handle_starttag(name, None, dict(attrs))
@@ -81,9 +53,15 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
if name.startswith('x'):
- data = unichr(int(name.lstrip('x'), 16))
+ real_name = int(name.lstrip('x'), 16)
else:
- data = unichr(int(name))
+ real_name = int(name)
+
+ try:
+ data = unichr(real_name)
+ except (ValueError, OverflowError), e:
+ data = u"\N{REPLACEMENT CHARACTER}"
+
self.handle_data(data)
def handle_entityref(self, name):
@@ -121,6 +99,40 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
+
+class HTMLParserTreeBuilder(HTMLTreeBuilder):
+
+ is_xml = False
+ features = [HTML, STRICT, HTMLPARSER]
+
+ def __init__(self, *args, **kwargs):
+ if CONSTRUCTOR_TAKES_STRICT:
+ kwargs['strict'] = False
+ self.parser_args = (args, kwargs)
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 4-tuple (markup, original encoding, encoding
+ declared within markup, whether any characters had to be
+ replaced with REPLACEMENT CHARACTER).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None, False
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
+
+ def feed(self, markup):
+ args, kwargs = self.parser_args
+ parser = BeautifulSoupHTMLParser(*args, **kwargs)
+ parser.soup = self.soup
+ parser.feed(markup)
+
+
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
@@ -147,7 +159,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
)*
\s* # trailing whitespace
""", re.VERBOSE)
- HTMLParserTreeBuilder.locatestarttagend = locatestarttagend
+ BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
@@ -210,7 +222,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
- HTMLParserTreeBuilder.parse_starttag = parse_starttag
- HTMLParserTreeBuilder.set_cdata_mode = set_cdata_mode
+ BeautifulSoupHTMLParser.parse_starttag = parse_starttag
+ BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True