summaryrefslogtreecommitdiff
path: root/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder')
-rw-r--r--bs4/builder/__init__.py12
-rw-r--r--bs4/builder/_htmlparser.py14
-rw-r--r--bs4/builder/_lxml.py8
3 files changed, 25 insertions, 9 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index fdb3362..21454e6 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -93,7 +93,7 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
-
+
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
@@ -125,7 +125,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
+
def feed(self, markup):
raise NotImplementedError()
@@ -235,11 +235,11 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
- # These are from HTML4, removed in HTML5.
- 'spacer', 'frame'
+
+ # These are from earlier versions of HTML and are removed in HTML5.
+ 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])
-
+
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 67890b3..ef9fd1e 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -64,7 +64,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
-
+
+ def error(self, msg):
+ """In Python 3, HTMLParser subclasses must implement error(), although this
+ requirement doesn't appear to be documented.
+
+ In Python 2, HTMLParser implements error() as raising an exception.
+
+ In any event, this method is called only on very strange markup and our best strategy
+ is to pretend it didn't happen and keep going.
+ """
+ warnings.warn(msg)
+
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
@@ -213,6 +224,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
+ parser.close()
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index d2ca287..3439271 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,9 +5,13 @@ __all__ = [
'LXMLTreeBuilder',
]
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError , e:
+ from collections import Callable
+
from io import BytesIO
from StringIO import StringIO
-import collections
from lxml import etree
from bs4.element import (
Comment,
@@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
- if isinstance(parser, collections.Callable):
+ if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser