summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt19
-rw-r--r--bs4/__init__.py4
-rw-r--r--bs4/diagnose.py24
-rw-r--r--doc/source/index.rst68
4 files changed, 95 insertions, 20 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 1daa84e..7abc700 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -15,8 +15,23 @@
or
from bs4 import _soup
- This may change in the future, so don't use this in code that goes
- into a file.
+ The alias may change in the future, so don't use this in code you're
+ going to run more than once.
+
+* Added the 'diagnose' submodule, which includes several useful
+ functions for reporting problems and doing tech support.
+
+ * diagnose(data) tries the given markup on every installed parser,
+ reporting exceptions and displaying successes. If a parser is not
+ installed, diagnose() mentions this fact.
+
+ * lxml_trace(data, html=True) runs the given markup through lxml's
+ XML parser or HTML parser, and prints out the parser events as
+ they happen. This helps you quickly determine whether a given
+ problem occurs in lxml code or Beautiful Soup code.
+
+ * htmlparser_trace(data) is the same thing, but for Python's
+ built-in HTMLParser class.
* The prettify() method now leaves the contents of <pre> tags
alone. [bug=1095654]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 88177d6..a5e7a86 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.1.3"
-__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
+__version__ = "4.2.0"
+__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index daaf523..e336633 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -1,13 +1,22 @@
"""Diagnostic functions, mainly for use when doing tech support."""
from StringIO import StringIO
from HTMLParser import HTMLParser
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
+import os
import traceback
import sys
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
+ print "Diagnostic running on Beautiful Soup %s" % __version__
+ print "Python version %s" % sys.version
+
+ if hasattr(data, 'read'):
+ data = data.read()
+ elif os.path.exists(data):
+ print '"%s" looks like a filename. Reading data from the file.' % data
+ data = open(data).read()
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
for builder in builder_registry.builders:
@@ -21,18 +30,25 @@ def diagnose(data):
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
+ from lxml import etree
+ print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+
+ if 'html5lib' in basic_parsers:
+ import html5lib
+ print "Found html5lib version %s" % html5lib.__version__
+ print
for parser in basic_parsers:
- print "Trying to parse your data with %s" % parser
+ print "Trying to parse your markup with %s" % parser
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
- print "%s could not parse the document." % parser
+ print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
- print "Here's what %s did with the document:" % parser
+ print "Here's what %s did with the markup:" % parser
print soup.prettify()
print "-" * 80
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 79286ab..c106f00 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2633,6 +2633,62 @@ thought I'd mention it::
Troubleshooting
===============
+``diagnose()``
+--------------
+
+If you're having trouble understanding what Beautiful Soup does to a
+document, pass it into the ``diagnose()`` function. (New in 4.2.0.)
+Beautiful Soup will print out a report showing you how different
+parsers handle the document, and tell you if you're missing a parser
+that Beautiful Soup could be using::
+
+ from bs4.diagnose import diagnose
+ data = open("bad.html").read()
+ diagnose(data)
+
+ # Diagnostic running on Beautiful Soup 4.2.0
+ # Python version 2.7.3 (default, Aug 1 2012, 05:16:07)
+ # I noticed that html5lib is not installed. Installing it may help.
+ # Found lxml version 2.3.2.0
+ #
+ # Trying to parse your markup with html.parser
+ # Here's what html.parser did with the markup:
+ # ...
+
+Just looking at the output of ``diagnose()`` may show you how to solve
+the problem. Even if not, you can paste the output of ``diagnose()``
+when asking for help.
+
+Errors when parsing a document
+------------------------------
+
+There are two different kinds of parse errors. There are crashes,
+where you feed a document to Beautiful Soup and it raises an
+exception, usually an ``HTMLParser.HTMLParseError``. And there is
+unexpected behavior, where a Beautiful Soup parse tree looks a lot
+different from the document used to create it.
+
+Almost none of these problems turn out to be problems with Beautiful
+Soup. This is not because Beautiful Soup is an amazingly well-written
+piece of software. It's because Beautiful Soup doesn't include any
+parsing code. Instead, it relies on external parsers. If one parser
+isn't working on a certain document, the best solution is to try a
+different parser. See `Installing a parser`_ for details and a parser
+comparison.
+
+The most common parse errors are ``HTMLParser.HTMLParseError:
+malformed start tag`` and ``HTMLParser.HTMLParseError: bad end
+tag``. These are both generated by Python's built-in HTML parser
+library, and the solution is to :ref:`install lxml or
+html5lib. <parser-installation>`
+
+The most common type of unexpected behavior is that you can't find a
+tag that you know is in the document. You saw it going in, but
+``find_all()`` returns ``[]`` or ``find()`` returns ``None``. This is
+another common problem with Python's built-in HTML parser, which
+sometimes skips tags it doesn't understand. Again, the solution is to
+:ref:`install lxml or html5lib. <parser-installation>`
+
Version mismatch problems
-------------------------
@@ -2678,18 +2734,6 @@ Other parser problems
parsers`_ for why this matters, and fix the problem by mentioning a
specific parser library in the ``BeautifulSoup`` constructor.
-* ``HTMLParser.HTMLParseError: malformed start tag`` or
- ``HTMLParser.HTMLParseError: bad end tag`` - Caused by
- giving Python's built-in HTML parser a document it can't handle. Any
- other ``HTMLParseError`` is probably the same problem. Solution:
- :ref:`Install lxml or html5lib. <parser-installation>`
-
-* If you can't find a tag that you know is in the document (that is,
- ``find_all()`` returned ``[]`` or ``find()`` returned ``None``),
- you're probably using Python's built-in HTML parser, which sometimes
- skips tags it doesn't understand. Solution: :ref:`Install lxml or
- html5lib. <parser-installation>`
-
* Because `HTML tags and attributes are case-insensitive
<http://www.w3.org/TR/html5/syntax.html#syntax>`_, all three HTML
parsers convert tag and attribute names to lowercase. That is, the