diff options
-rw-r--r-- | NEWS.txt | 19 | ||||
-rw-r--r-- | bs4/__init__.py | 4 | ||||
-rw-r--r-- | bs4/diagnose.py | 24 | ||||
-rw-r--r-- | doc/source/index.rst | 68 |
4 files changed, 95 insertions, 20 deletions
@@ -15,8 +15,23 @@ or from bs4 import _soup - This may change in the future, so don't use this in code that goes - into a file. + The alias may change in the future, so don't use this in code you're + going to run more than once. + +* Added the 'diagnose' submodule, which includes several useful + functions for reporting problems and doing tech support. + + * diagnose(data) tries the given markup on every installed parser, + reporting exceptions and displaying successes. If a parser is not + installed, diagnose() mentions this fact. + + * lxml_trace(data, html=True) runs the given markup through lxml's + XML parser or HTML parser, and prints out the parser events as + they happen. This helps you quickly determine whether a given + problem occurs in lxml code or Beautiful Soup code. + + * htmlparser_trace(data) is the same thing, but for Python's + built-in HTMLParser class. * The prettify() method now leaves the contents of <pre> tags alone. [bug=1095654] diff --git a/bs4/__init__.py b/bs4/__init__.py index 88177d6..a5e7a86 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.1.3" -__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" +__version__ = "4.2.0" +__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] diff --git a/bs4/diagnose.py b/bs4/diagnose.py index daaf523..e336633 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -1,13 +1,22 @@ """Diagnostic functions, mainly for use when doing tech support.""" from StringIO import StringIO from HTMLParser import HTMLParser -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, __version__ from bs4.builder import builder_registry +import os import traceback import sys def diagnose(data): """Diagnostic suite for isolating common problems.""" + print "Diagnostic running on Beautiful Soup %s" % 
__version__ + print "Python version %s" % sys.version + + if hasattr(data, 'read'): + data = data.read() + elif os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' % data + data = open(data).read() basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: for builder in builder_registry.builders: @@ -21,18 +30,25 @@ def diagnose(data): if 'lxml' in basic_parsers: basic_parsers.append(["lxml", "xml"]) + from lxml import etree + print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + + if 'html5lib' in basic_parsers: + import html5lib + print "Found html5lib version %s" % html5lib.__version__ + print for parser in basic_parsers: - print "Trying to parse your data with %s" % parser + print "Trying to parse your markup with %s" % parser success = False try: soup = BeautifulSoup(data, parser) success = True except Exception, e: - print "%s could not parse the document." % parser + print "%s could not parse the markup." % parser traceback.print_exc() if success: - print "Here's what %s did with the document:" % parser + print "Here's what %s did with the markup:" % parser print soup.prettify() print "-" * 80 diff --git a/doc/source/index.rst b/doc/source/index.rst index 79286ab..c106f00 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2633,6 +2633,62 @@ thought I'd mention it:: Troubleshooting =============== +``diagnose()`` +-------------- + +If you're having trouble understanding what Beautiful Soup does to a +document, pass it into the ``diagnose()`` function. (New in 4.2.0.) 
+Beautiful Soup will print out a report showing you how different +parsers handle the document, and tell you if you're missing a parser +that Beautiful Soup could be using:: + + from bs4.diagnose import diagnose + data = open("bad.html").read() + diagnose(data) + + # Diagnostic running on Beautiful Soup 4.2.0 + # Python version 2.7.3 (default, Aug 1 2012, 05:16:07) + # I noticed that html5lib is not installed. Installing it may help. + # Found lxml version 2.3.2.0 + # + # Trying to parse your markup with html.parser + # Here's what html.parser did with the markup: + # ... + +Just looking at the output of ``diagnose()`` may show you how to solve the +problem. Even if not, you can paste the output of ``diagnose()`` when +asking for help. + +Errors when parsing a document +------------------------------ + +There are two different kinds of parse errors. There are crashes, +where you feed a document to Beautiful Soup and it raises an +exception, usually an ``HTMLParser.HTMLParseError``. And there is +unexpected behavior, where a Beautiful Soup parse tree looks a lot +different than the document used to create it. + +Almost none of these problems turn out to be problems with Beautiful +Soup. This is not because Beautiful Soup is an amazingly well-written +piece of software. It's because Beautiful Soup doesn't include any +parsing code. Instead, it relies on external parsers. If one parser +isn't working on a certain document, the best solution is to try a +different parser. See `Installing a parser`_ for details and a parser +comparison. + +The most common parse errors are ``HTMLParser.HTMLParseError: +malformed start tag`` and ``HTMLParser.HTMLParseError: bad end +tag``. These are both generated by Python's built-in HTML parser +library, and the solution is to :ref:`install lxml or +html5lib. <parser-installation>` + +The most common type of unexpected behavior is that you can't find a +tag that you know is in the document. 
You saw it going in, but +``find_all()`` returns ``[]`` or ``find()`` returns ``None``. This is +another common problem with Python's built-in HTML parser, which +sometimes skips tags it doesn't understand. Again, the solution is to +:ref:`install lxml or html5lib. <parser-installation>` + Version mismatch problems ------------------------- @@ -2678,18 +2734,6 @@ Other parser problems parsers`_ for why this matters, and fix the problem by mentioning a specific parser library in the ``BeautifulSoup`` constructor. -* ``HTMLParser.HTMLParseError: malformed start tag`` or - ``HTMLParser.HTMLParseError: bad end tag`` - Caused by - giving Python's built-in HTML parser a document it can't handle. Any - other ``HTMLParseError`` is probably the same problem. Solution: - :ref:`Install lxml or html5lib. <parser-installation>` - -* If you can't find a tag that you know is in the document (that is, - ``find_all()`` returned ``[]`` or ``find()`` returned ``None``), - you're probably using Python's built-in HTML parser, which sometimes - skips tags it doesn't understand. Solution: :ref:`Install lxml or - html5lib. <parser-installation>` - * Because `HTML tags and attributes are case-insensitive <http://www.w3.org/TR/html5/syntax.html#syntax>`_, all three HTML parsers convert tag and attribute names to lowercase. That is, the |