diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-18 13:14:57 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-04-18 13:14:57 -0400 |
commit | 622fead3fef2d870a85d570c5416166076b8c8c4 (patch) | |
tree | a44178e5a8cb3c4da547164511f2dd289174e327 | |
parent | 3d0ae02cc3d0b947ef6102b31f4b354eec9b543a (diff) |
Print a warning on HTMLParseErrors to let people know they should install an external parser.
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 14 | ||||
-rw-r--r-- | doc/source/index.rst | 3 |
3 files changed, 16 insertions, 4 deletions
@@ -4,6 +4,9 @@
   definitions ending with two question marks instead of
   one. [bug=984258]
 
+* Print a warning on HTMLParseErrors to let people know they should
+  install a better parser library.
+
 = 4.0.4 (20120416) =
 
 * Fixed a bug that sometimes created disconnected trees.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 3dee51b..d5d8681 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -4,8 +4,12 @@ __all__ = [
     'HTMLParserTreeBuilder',
     ]
 
-from HTMLParser import HTMLParser
+from HTMLParser import (
+    HTMLParser,
+    HTMLParseError,
+    )
 import sys
+import warnings
 
 # Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
 # argument, which we'd like to set to False. Unfortunately,
@@ -138,8 +142,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         args, kwargs = self.parser_args
         parser = BeautifulSoupHTMLParser(*args, **kwargs)
         parser.soup = self.soup
-        parser.feed(markup)
-
+        try:
+            parser.feed(markup)
+        except HTMLParseError, e:
+            warnings.warn(RuntimeWarning(
+                "Python's built-in HTMLParser cannot parse this document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
+            raise e
 
 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 5aab90e..734851d 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2548,7 +2548,8 @@ Other parser problems
   parsers`_ for why this matters, and fix the problem by mentioning a
   specific parser library in the ``BeautifulSoup`` constructor.
 
-* ``HTMLParser.HTMLParseError: malformed start tag`` - Caused by
+* ``HTMLParser.HTMLParseError: malformed start tag`` or
+  ``HTMLParser.HTMLParseError: bad end tag`` - Caused by
   giving Python's built-in HTML parser a document it can't handle. Any
   other ``HTMLParseError`` is probably the same problem. Solution:
   :ref:`Install lxml or html5lib. <parser-installation>` |