4 files changed, 95 insertions, 20 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 1daa84e..7abc700 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -15,8 +15,23 @@
    or
   from bs4 import _soup
 
-  This may change in the future, so don't use this in code that goes
-  into a file.
+  The alias may change in the future, so don't use this in code you're
+  going to run more than once.
+
+* Added the 'diagnose' submodule, which includes several useful
+  functions for reporting problems and doing tech support.
+
+  * diagnose(data) tries the given markup on every installed parser,
+    reporting exceptions and displaying successes. If a parser is not
+    installed, diagnose() mentions this fact.
+
+  * lxml_trace(data, html=True) runs the given markup through lxml's
+    XML parser or HTML parser, and prints out the parser events as
+    they happen. This helps you quickly determine whether a given
+    problem occurs in lxml code or Beautiful Soup code.
+
+  * htmlparser_trace(data) is the same thing, but for Python's
+    built-in HTMLParser class.
 
 * The prettify() method now leaves the contents of <pre> tags
   alone. [bug=1095654]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 88177d6..a5e7a86 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.1.3"
-__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
+__version__ = "4.2.0"
+__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
 __license__ = "MIT"
 
 __all__ = ['BeautifulSoup']
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index daaf523..e336633 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -1,13 +1,22 @@
 """Diagnostic functions, mainly for use when doing tech support."""
 from StringIO import StringIO
 from HTMLParser import HTMLParser
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry
+import os
 import traceback
 import sys
 
 def diagnose(data):
     """Diagnostic suite for isolating common problems."""
+    print "Diagnostic running on Beautiful Soup %s" % __version__
+    print "Python version %s" % sys.version
+
+    if hasattr(data, 'read'):
+        data = data.read()
+    elif os.path.exists(data):
+        print '"%s" looks like a filename. Reading data from the file.' % data
+        data = open(data).read()
     basic_parsers = ["html.parser", "html5lib", "lxml"]
     for name in basic_parsers:
         for builder in builder_registry.builders:
@@ -21,18 +30,25 @@ def diagnose(data):
 
     if 'lxml' in basic_parsers:
         basic_parsers.append(["lxml", "xml"])
+        from lxml import etree
+        print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+
+    if 'html5lib' in basic_parsers:
+        import html5lib
+        print "Found html5lib version %s" % html5lib.__version__
+    print
 
     for parser in basic_parsers:
-        print "Trying to parse your data with %s" % parser
+        print "Trying to parse your markup with %s" % parser
         success = False
         try:
             soup = BeautifulSoup(data, parser)
             success = True
         except Exception, e:
-            print "%s could not parse the document." % parser
+            print "%s could not parse the markup." % parser
             traceback.print_exc()
         if success:
-            print "Here's what %s did with the document:" % parser
+            print "Here's what %s did with the markup:" % parser
             print soup.prettify()
 
         print "-" * 80
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 79286ab..c106f00 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2633,6 +2633,62 @@ thought I'd mention it::
 Troubleshooting
 ===============
 
+``diagnose()``
+--------------
+
+If you're having trouble understanding what Beautiful Soup does to a
+document, pass it into the ``diagnose()`` function. (New in 4.2.0.)
+Beautiful Soup will print out a report showing you how different
+parsers handle the document, and tell you if you're missing a parser
+that Beautiful Soup could be using::
+
+ from bs4.diagnose import diagnose
+ data = open("bad.html").read()
+ diagnose(data)
+
+ # Diagnostic running on Beautiful Soup 4.2.0
+ # Python version 2.7.3 (default, Aug  1 2012, 05:16:07)
+ # I noticed that html5lib is not installed. Installing it may help.
+ # Found lxml version 2.3.2.0
+ #
+ # Trying to parse your markup with html.parser
+ # Here's what html.parser did with the markup:
+ # ...
+
+Just looking at the output of ``diagnose()`` may show you how to solve the
+problem. Even if not, you can paste the output of ``diagnose()`` when
+asking for help.
+
+Errors when parsing a document
+------------------------------
+
+There are two different kinds of parse errors. There are crashes,
+where you feed a document to Beautiful Soup and it raises an
+exception, usually an ``HTMLParser.HTMLParseError``. And there is
+unexpected behavior, where a Beautiful Soup parse tree looks a lot
+different from the document used to create it.
+
+Almost none of these problems turn out to be problems with Beautiful
+Soup. This is not because Beautiful Soup is an amazingly well-written
+piece of software. It's because Beautiful Soup doesn't include any
+parsing code. Instead, it relies on external parsers. If one parser
+isn't working on a certain document, the best solution is to try a
+different parser. See `Installing a parser`_ for details and a parser
+comparison.
+
+The most common parse errors are ``HTMLParser.HTMLParseError:
+malformed start tag`` and ``HTMLParser.HTMLParseError: bad end
+tag``. These are both generated by Python's built-in HTML parser
+library, and the solution is to :ref:`install lxml or
+html5lib. <parser-installation>`
+
+The most common type of unexpected behavior is that you can't find a
+tag that you know is in the document. You saw it going in, but
+``find_all()`` returns ``[]`` or ``find()`` returns ``None``. This is
+another common problem with Python's built-in HTML parser, which
+sometimes skips tags it doesn't understand.  Again, the solution is to
+:ref:`install lxml or html5lib. <parser-installation>`
+
 Version mismatch problems
 -------------------------
 
@@ -2678,18 +2734,6 @@ Other parser problems
   parsers`_ for why this matters, and fix the problem by mentioning a
   specific parser library in the ``BeautifulSoup`` constructor.
 
-* ``HTMLParser.HTMLParseError: malformed start tag`` or
-  ``HTMLParser.HTMLParseError: bad end tag`` - Caused by
-  giving Python's built-in HTML parser a document it can't handle. Any
-  other ``HTMLParseError`` is probably the same problem. Solution:
-  :ref:`Install lxml or html5lib. <parser-installation>`
-
-* If you can't find a tag that you know is in the document (that is,
-  ``find_all()`` returned ``[]`` or ``find()`` returned ``None``),
-  you're probably using Python's built-in HTML parser, which sometimes
-  skips tags it doesn't understand. Solution: :ref:`Install lxml or
-  html5lib. <parser-installation>`
-
 * Because `HTML tags and attributes are case-insensitive
   <http://www.w3.org/TR/html5/syntax.html#syntax>`_, all three HTML
   parsers convert tag and attribute names to lowercase. That is, the