summary | refs | log | tree | commit | diff
path: root/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- bs4/builder/_htmlparser.py 111
1 files changed, 98 insertions, 13 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4531407..2bb764f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -53,7 +53,11 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
-
+ """A subclass of the Python standard library's HTMLParser class, which
+ listens for HTMLParser events and translates them into calls
+ to Beautiful Soup's tree construction API.
+ """
+
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self, *args, **kwargs)
@@ -67,20 +71,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element = []
def error(self, msg):
- """In Python 3, HTMLParser subclasses must implement error(), although this
- requirement doesn't appear to be documented.
+ """In Python 3, HTMLParser subclasses must implement error(), although
+ this requirement doesn't appear to be documented.
- In Python 2, HTMLParser implements error() as raising an exception.
+ In Python 2, HTMLParser implements error() by raising an exception,
+ which we don't want to do.
- In any event, this method is called only on very strange markup and our best strategy
- is to pretend it didn't happen and keep going.
+ In any event, this method is called only on very strange
+ markup and our best strategy is to pretend it didn't happen
+ and keep going.
"""
warnings.warn(msg)
def handle_startendtag(self, name, attrs):
- # This is only called when the markup looks like
- # <tag/>.
+ """Handle an incoming empty-element tag.
+ This is only called when the markup looks like <tag/>.
+
+ :param name: Name of the tag.
+ :param attrs: The tag's attributes, as a list of (name, value) pairs.
+ """
# is_startend() tells handle_starttag not to close the tag
# just because its name matches a known empty-element tag. We
# know that this is an empty-element tag and we want to call
@@ -89,6 +99,14 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_endtag(name)
def handle_starttag(self, name, attrs, handle_empty_element=True):
+ """Handle an opening tag, e.g. '<tag>'.
+
+ :param name: Name of the tag.
+ :param attrs: The tag's attributes, as a list of (name, value) pairs.
+ :param handle_empty_element: True if this tag is known to be
+ an empty-element tag (i.e. there is not expected to be any
+ closing tag).
+ """
# XXX namespace
attr_dict = {}
for key, value in attrs:
@@ -121,6 +139,13 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.already_closed_empty_element.append(name)
def handle_endtag(self, name, check_already_closed=True):
+ """Handle a closing tag, e.g. '</tag>'.
+
+ :param name: A tag name.
+ :param check_already_closed: True if this tag is expected to
+ be the closing portion of an empty-element tag,
+ e.g. '<tag></tag>'.
+ """
#print "END", name
if check_already_closed and name in self.already_closed_empty_element:
# This is a redundant end tag for an empty-element tag.
@@ -132,9 +157,16 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.handle_endtag(name)
def handle_data(self, data):
+ """Handle some textual data that shows up between tags."""
self.soup.handle_data(data)
def handle_charref(self, name):
+ """Handle a numeric character reference by converting it to the
+ corresponding Unicode character and treating it as textual
+ data.
+
+ :param name: Character number, possibly in hexadecimal.
+ """
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
@@ -168,6 +200,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_entityref(self, name):
+ """Handle a named entity reference by converting it to the
+ corresponding Unicode character and treating it as textual
+ data.
+
+ :param name: Name of the entity reference.
+ """
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
@@ -181,17 +219,29 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.handle_data(data)
def handle_comment(self, data):
+ """Handle an HTML comment.
+
+ :param data: The text of the comment.
+ """
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
+ """Handle a DOCTYPE declaration.
+
+ :param data: The text of the declaration.
+ """
self.soup.endData()
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
+ """Handle a declaration of unknown type -- probably a CDATA block.
+
+ :param data: The text of the declaration.
+ """
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
@@ -202,13 +252,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData(cls)
def handle_pi(self, data):
+ """Handle a processing instruction.
+
+ :param data: The text of the instruction.
+ """
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
-
+ """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
+ found in the Python standard library.
+ """
is_xml = False
picklable = True
NAME = HTMLPARSER
@@ -219,6 +275,16 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
TRACKS_LINE_NUMBERS = True
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+ """Constructor.
+
+ :param parser_args: Positional arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param parser_kwargs: Keyword arguments to pass into
+ the BeautifulSoupHTMLParser constructor, once it's
+ invoked.
+ :param kwargs: Keyword arguments for the superclass constructor.
+ """
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
@@ -230,15 +296,31 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
- """
- :return: A 4-tuple (markup, original encoding, encoding
- declared within markup, whether any characters had to be
- replaced with REPLACEMENT CHARACTER).
+
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ :param markup: Some markup -- probably a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
"""
if isinstance(markup, unicode):
+ # Parse Unicode as-is.
yield (markup, None, None, False)
return
+ # Ask UnicodeDammit to sniff the most likely encoding.
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
exclude_encodings=exclude_encodings)
@@ -247,6 +329,9 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
dammit.contains_replacement_characters)
def feed(self, markup):
+ """Run some incoming markup through a new `BeautifulSoupHTMLParser`,
+ populating the `BeautifulSoup` object in self.soup.
+ """
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup