diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-20 14:05:38 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-20 14:05:38 -0500 |
commit | 60cb51632dce022d1a4aff18500d286e58e0bd5c (patch) | |
tree | 38120742636e4867b6284093ca4c19510623cf9a /bs4/builder/_htmlparser.py | |
parent | 8f6446b1e45ad69ecd255cac3d9b9939127c91c2 (diff) |
It's now possible to copy a BeautifulSoup object created with the html.parser treebuilder.
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 70 |
1 file changed, 38 insertions, 32 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 0ec878b..62473cf 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -38,35 +38,7 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' -class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): - - is_xml = False - features = [HTML, STRICT, HTMLPARSER] - - def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: - kwargs['strict'] = False - return super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs) - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - """ - :return: A 4-tuple (markup, original encoding, encoding - declared within markup, whether any characters had to be - replaced with REPLACEMENT CHARACTER). - """ - if isinstance(markup, unicode): - return markup, None, None, False - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) - - def feed(self, markup): - super(HTMLParserTreeBuilder, self).feed(markup) - +class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): self.soup.handle_starttag(name, dict(attrs)) @@ -126,6 +98,40 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + features = [HTML, STRICT, HTMLPARSER] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT: + kwargs['strict'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). 
+ """ + if isinstance(markup, unicode): + return markup, None, None, False + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + parser.feed(markup) + + # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like <p></p> as a # string. @@ -152,7 +158,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: )* \s* # trailing whitespace """, re.VERBOSE) - HTMLParserTreeBuilder.locatestarttagend = locatestarttagend + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend from html.parser import tagfind, attrfind @@ -215,7 +221,7 @@ if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: self.cdata_elem = elem.lower() self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) - HTMLParserTreeBuilder.parse_starttag = parse_starttag - HTMLParserTreeBuilder.set_cdata_mode = set_cdata_mode + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode CONSTRUCTOR_TAKES_STRICT = True |