diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-05-15 11:10:12 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-05-15 11:10:12 -0400 |
commit | 87c7f93d7feb8bd982dbbc42d814352b60be344d (patch) | |
tree | 3905df101f631ca66055ab2af50c36d34227268b | |
parent | ada530f6bc24bf4e536c1c859d798b836ec0799c (diff) | |
parent | c3cc17f0dda7d378890a12fd8b5c29de9f923dab (diff) |
Merge.
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/diagnose.py | 15 | ||||
-rw-r--r-- | bs4/element.py | 11 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 2 | ||||
-rw-r--r-- | doc/source/conf.py | 256 | ||||
-rw-r--r-- | doc/source/index.rst | 64 | ||||
-rw-r--r-- | setup.py | 2 |
7 files changed, 321 insertions, 33 deletions
@@ -1,4 +1,4 @@ -= 4.2.0 (Unreleased) = += 4.2.0 (20130514) = * The Tag.select() method now supports a much wider variety of CSS selectors. @@ -60,7 +60,7 @@ * Stop a crash when unwisely messing with a tag that's been decomposed. [bug=1097699] -* Now that lxml's segfault on invalid doctype has been fixed, fix a +* Now that lxml's segfault on invalid doctype has been fixed, fixed a corresponding problem on the Beautiful Soup end that was previously invisible. [bug=984936] diff --git a/bs4/diagnose.py b/bs4/diagnose.py index d4e657c..4b5f6e4 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -14,11 +14,6 @@ def diagnose(data): print "Diagnostic running on Beautiful Soup %s" % __version__ print "Python version %s" % sys.version - if hasattr(data, 'read'): - data = data.read() - elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data - data = open(data).read() basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: for builder in builder_registry.builders: @@ -38,6 +33,16 @@ def diagnose(data): if 'html5lib' in basic_parsers: import html5lib print "Found html5lib version %s" % html5lib.__version__ + + if hasattr(data, 'read'): + data = data.read() + elif os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' % data + data = open(data).read() + elif data.startswith("http:") or data.startswith("https:"): + print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data + print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + return print for parser in basic_parsers: diff --git a/bs4/element.py b/bs4/element.py index 1f121f4..f6864f2 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1392,10 +1392,13 @@ class Tag(PageElement): def recursiveChildGenerator(self): return self.descendants - # This was kind of misleading because has_key() (attributes) was - # different from __in__ (contents). 
has_key() is gone in Python 3, - # anyway. - has_key = has_attr + def has_key(self, key): + """This was kind of misleading because has_key() (attributes) + was different from __in__ (contents). has_key() is gone in + Python 3, anyway.""" + warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( + key)) + return self.has_attr(key) # Next, a couple classes to represent queries and their results. class SoupStrainer(object): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index b07de8c..f60485b 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1618,7 +1618,7 @@ class TestSoupSelector(TreeTest): for el in els: self.assertEqual(el.name, 'p') self.assertEqual(els[1]['class'], ['onep']) - self.assertFalse(els[0].has_key('class')) + self.assertFalse(els[0].has_attr('class')) def test_a_bunch_of_emptys(self): for selector in ('div#main del', 'div#main div.oops', 'div div#main'): diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..102c3cf --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- +# +# Beautiful Soup documentation build configuration file, created by +# sphinx-quickstart on Thu Jan 26 11:22:55 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. 
+#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Beautiful Soup' +copyright = u'2012, Leonard Richardson' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '4' +# The full version, including alpha/beta/rc tags. +release = '4.2.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. 
+pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. 
+#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'BeautifulSoupdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation', + u'Leonard Richardson', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Additional stuff for the LaTeX preamble. 
+#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'beautifulsoup', u'Beautiful Soup Documentation', + [u'Leonard Richardson'], 1) +] + + +# -- Options for Epub output --------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = u'Beautiful Soup' +epub_author = u'Leonard Richardson' +epub_publisher = u'Leonard Richardson' +epub_copyright = u'2012, Leonard Richardson' + +# The language of the text. It defaults to the language option +# or en if the language is not set. +#epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +#epub_scheme = '' + +# The unique identifier of the text. This can be an ISBN number +# or the project homepage. +#epub_identifier = '' + +# A unique identification for the text. +#epub_uid = '' + +# HTML files that should be inserted before the pages created by sphinx. +# The format is a list of tuples containing the path and title. +#epub_pre_files = [] + +# HTML files that should be inserted after the pages created by sphinx. +# The format is a list of tuples containing the path and title. +#epub_post_files = [] + +# A list of files that should not be packed into the epub file. +#epub_exclude_files = [] + +# The depth of the table of contents in toc.ncx. +#epub_tocdepth = 3 + +# Allow duplicate toc entries. 
+#epub_tocdup = True diff --git a/doc/source/index.rst b/doc/source/index.rst index 8e0204b..5d4c0fe 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -31,7 +31,10 @@ Getting help If you have questions about Beautiful Soup, or run into problems, `send mail to the discussion group -<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_. +<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_. If +your problem involves parsing an HTML document, be sure to mention +:ref:`what the diagnose() function says <diagnose>` about +that document. Quick Start =========== @@ -455,6 +458,12 @@ them. In particular, since a string can't contain anything (the way a tag may contain a string or another tag), strings don't support the ``.contents`` or ``.string`` attributes, or the ``find()`` method. +If you want to use a ``NavigableString`` outside of Beautiful Soup, +you should call ``unicode()`` on it to turn it into a normal Python +Unicode string. If you don't, your string will carry around a +reference to the entire Beautiful Soup parse tree, even when you're +done using Beautiful Soup. This is a big waste of memory. + ``BeautifulSoup`` ----------------- @@ -970,7 +979,7 @@ Searching the tree ================== Beautiful Soup defines a lot of methods for searching the parse tree, -but they're all very similar. I'm going to spend a lot of time explain +but they're all very similar. I'm going to spend a lot of time explaining the two most popular methods: ``find()`` and ``find_all()``. The other methods take almost exactly the same arguments, so I'll just cover them briefly. @@ -995,7 +1004,7 @@ Once again, I'll be using the "three sisters" document as an example:: soup = BeautifulSoup(html_doc) By passing in a filter to an argument like ``find_all()``, you can -isolate whatever parts of the document you're interested. +zoom in on the parts of the document you're interested in. 
Kinds of filters ---------------- @@ -1095,7 +1104,7 @@ Here's a function that returns ``True`` if a tag defines the "class" attribute but doesn't define the "id" attribute:: def has_class_but_no_id(tag): - return tag.has_key('class') and not tag.has_key('id') + return tag.has_attr('class') and not tag.has_attr('id') Pass this function into ``find_all()`` and you'll pick up all the <p> tags:: @@ -1973,9 +1982,6 @@ whatever's inside that tag. It's good for stripping out markup:: Like ``replace_with()``, ``unwrap()`` returns the tag that was replaced. -(In earlier versions of Beautiful Soup, ``unwrap()`` was called -``replace_with_children()``, and that name will still work.) - Output ====== @@ -2265,7 +2271,7 @@ into an <html> tag.:: BeautifulSoup("<a><b /></a>", "xml") # <?xml version="1.0" encoding="utf-8"?> - # <a><b /></a> + # <a><b/></a> There are also differences between HTML parsers. If you give Beautiful Soup a perfectly-formed HTML document, these differences won't @@ -2556,8 +2562,8 @@ ignore everything that wasn't an <a> tag in the first place. The document are parsed. You just create a ``SoupStrainer`` and pass it in to the ``BeautifulSoup`` constructor as the ``parse_only`` argument. -(Note that *this feature won't work if you're using the html5lib -parser*. If you use html5lib, the whole document will be parsed, no +(Note that *this feature won't work if you're using the html5lib parser*. +If you use html5lib, the whole document will be parsed, no matter what. This is because html5lib constantly rearranges the parse tree as it works, and if some part of the document didn't actually make it into the parse tree, it'll crash. To avoid confusion, in the @@ -2638,14 +2644,16 @@ thought I'd mention it:: Troubleshooting =============== +.. _diagnose: + ``diagnose()`` -------------- If you're having trouble understanding what Beautiful Soup does to a -document, pass it into the ``diagnose()`` function. (New in 4.2.0.) 
-Beautiful Soup will print out a report showing you how different -parsers handle the document, and tell you if you're missing a parser -that Beautiful Soup could be using:: +document, pass the document into the ``diagnose()`` function. (New in +Beautiful Soup 4.2.0.) Beautiful Soup will print out a report showing +you how different parsers handle the document, and tell you if you're +missing a parser that Beautiful Soup could be using:: from bs4.diagnose import diagnose data = open("bad.html").read() @@ -2746,16 +2754,11 @@ Other parser problems preserve mixed-case or uppercase tags and attributes, you'll need to :ref:`parse the document as XML. <parsing-xml>` +.. _misc: Miscellaneous ------------- -* ``KeyError: [attr]`` - Caused by accessing ``tag['attr']`` when the - tag in question doesn't define the ``attr`` attribute. The most - common errors are ``KeyError: 'href'`` and ``KeyError: - 'class'``. Use ``tag.get('attr')`` if you're not sure ``attr`` is - defined, just as you would with a Python dictionary. - * ``UnicodeEncodeError: 'charmap' codec can't encode character u'\xfoo' in position bar`` (or just about any other ``UnicodeEncodeError``) - This is not a problem with Beautiful Soup. @@ -2768,6 +2771,27 @@ Miscellaneous solution is to explicitly encode the Unicode string into UTF-8 with ``u.encode("utf8")``. +* ``KeyError: [attr]`` - Caused by accessing ``tag['attr']`` when the + tag in question doesn't define the ``attr`` attribute. The most + common errors are ``KeyError: 'href'`` and ``KeyError: + 'class'``. Use ``tag.get('attr')`` if you're not sure ``attr`` is + defined, just as you would with a Python dictionary. + +* ``AttributeError: 'ResultSet' object has no attribute 'foo'`` - This + usually happens because you expected ``find_all()`` to return a + single tag or string. But ``find_all()`` returns a *list* of tags + and strings--a ``ResultSet`` object. You need to iterate over the + list and look at the ``.foo`` of each one. 
Or, if you really only + want one result, you need to use ``find()`` instead of + ``find_all()``. + +* ``AttributeError: 'NoneType' object has no attribute 'foo'`` - This + usually happens because you called ``find()`` and then tried to + access the ``.foo`` attribute of the result. But in your case, + ``find()`` didn't find anything, so it returned ``None``, instead of + returning a tag or a string. You need to figure out why your + ``find()`` call isn't returning anything. + Improving Performance --------------------- @@ -7,7 +7,7 @@ except ImportError: from distutils.command.build_py import build_py setup(name="beautifulsoup4", - version = "4.1.3", + version = "4.2.0", author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", |