From 43aeaf51780466e023418f7dfd1f456614c061e2 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 7 Feb 2012 23:40:06 -0500 Subject: Package the docs with the code. --- doc/source/6.1.jpg | Bin 22619 -> 0 bytes doc/source/conf.py | 256 ------ doc/source/index.rst | 2427 -------------------------------------------------- 3 files changed, 2683 deletions(-) delete mode 100644 doc/source/6.1.jpg delete mode 100644 doc/source/conf.py delete mode 100644 doc/source/index.rst (limited to 'doc/source') diff --git a/doc/source/6.1.jpg b/doc/source/6.1.jpg deleted file mode 100644 index 97014f0..0000000 Binary files a/doc/source/6.1.jpg and /dev/null differ diff --git a/doc/source/conf.py b/doc/source/conf.py deleted file mode 100644 index 56c0939..0000000 --- a/doc/source/conf.py +++ /dev/null @@ -1,256 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Beautiful Soup documentation build configuration file, created by -# sphinx-quickstart on Thu Jan 26 11:22:55 2012. -# -# This file is execfile()d with the current directory set to its containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys, os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ----------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'Beautiful Soup' -copyright = u'2012, Leonard Richardson' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '4' -# The full version, including alpha/beta/rc tags. -release = '4.0.0' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = [] - -# The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). 
-#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - - -# -- Options for HTML output --------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'default' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'BeautifulSoupdoc' - - -# -- Options for LaTeX output -------------------------------------------------- - -# The paper size ('letter' or 'a4'). -#latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). 
-latex_documents = [ - ('index', 'BeautifulSoup.tex', u'Beautiful Soup Documentation', - u'Leonard Richardson', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Additional stuff for the LaTeX preamble. -#latex_preamble = '' - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output -------------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'beautifulsoup', u'Beautiful Soup Documentation', - [u'Leonard Richardson'], 1) -] - - -# -- Options for Epub output --------------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = u'Beautiful Soup' -epub_author = u'Leonard Richardson' -epub_publisher = u'Leonard Richardson' -epub_copyright = u'2012, Leonard Richardson' - -# The language of the text. It defaults to the language option -# or en if the language is not set. -#epub_language = '' - -# The scheme of the identifier. Typical schemes are ISBN or URL. -#epub_scheme = '' - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -#epub_identifier = '' - -# A unique identification for the text. -#epub_uid = '' - -# HTML files that should be inserted before the pages created by sphinx. -# The format is a list of tuples containing the path and title. -#epub_pre_files = [] - -# HTML files shat should be inserted after the pages created by sphinx. -# The format is a list of tuples containing the path and title. -#epub_post_files = [] - -# A list of files that should not be packed into the epub file. -#epub_exclude_files = [] - -# The depth of the table of contents in toc.ncx. -#epub_tocdepth = 3 - -# Allow duplicate toc entries. -#epub_tocdup = True diff --git a/doc/source/index.rst b/doc/source/index.rst deleted file mode 100644 index fa0648d..0000000 --- a/doc/source/index.rst +++ /dev/null @@ -1,2427 +0,0 @@ -Beautiful Soup Documentation -============================ - -.. image:: 6.1.jpg - :align: right - :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself." - -`Beautiful Soup `_ is a -Python library for pulling data out of HTML and XML files. It works -with your favorite parser to provide idiomatic ways of navigating, -searching, and modifying the parse tree. It commonly saves programmers -hours or days of work. - -These instructions illustrate all major features of Beautiful Soup 4, -with examples. I show you what the library is good for, how it works, -how to use it, how to make it do what you want, and what to do when it -violates your expectations. - -The examples in this documentation should work the same way in Python -2.7 and Python 3.2. - -You might be looking for the documentation for `Beautiful Soup 3 -`_. If -you want to learn about the differences between Beautiful Soup 3 and -Beautiful Soup 4, see `Porting code to BS4`_. 
Getting help
------------

If you have questions about Beautiful Soup, or run into problems,
`send mail to the discussion group
<https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup>`_.

Quick Start
===========

Here's an HTML document I'll be using as an example throughout this
document. It's part of a story from `Alice in Wonderland`::

    html_doc = """
    <html><head><title>The Dormouse's story</title></head>

    <p class="title"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
- """ - -Running the "three sisters" document through Beautiful Soup gives us a -``BeautifulSoup`` object, which represents the document as a nested -data structure:: - - from bs4 import BeautifulSoup - soup = BeautifulSoup(html_doc) - - print(soup.prettify()) - # - # - # - # The Dormouse's story - # - # - # - #

- # - # The Dormouse's story - # - #

- #

- # Once upon a time there were three little sisters; and their names were - # - # Elsie - # - # , - # - # Lacie - # - # and - # - # Tillie - # - # ; and they lived at the bottom of a well. - #

- #

- # ... - #

- # - # - -Here are some simple ways to navigate that data structure:: - - soup.title - # The Dormouse's story - - soup.title.name - # u'title' - - soup.title.string - # u'The Dormouse's story' - - soup.title.parent.name - # u'head' - - soup.p - #

The Dormouse's story

    soup.p['class']
    # u'title'

    soup.a
    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

    soup.find_all('a')
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

    soup.find(id="link3")
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

One common task is extracting all the URLs found within a page's <a> tags::

    for link in soup.find_all('a'):
        print(link.get('href'))
    # http://example.com/elsie
    # http://example.com/lacie
    # http://example.com/tillie

Another common task is extracting all the text from a page::

    print(soup.get_text())
    # The Dormouse's story
    #
    # The Dormouse's story
    #
    # Once upon a time there were three little sisters; and their names were
    # Elsie,
    # Lacie and
    # Tillie;
    # and they lived at the bottom of a well.
    #
    # ...

Does this look like what you need? If so, read on.

Installing Beautiful Soup
=========================

Beautiful Soup 4 is published through PyPI, so you can install it with
``easy_install`` or ``pip``. The package name is ``beautifulsoup4``,
and the same package works on Python 2 and Python 3.

:kbd:`$ easy_install beautifulsoup4`

:kbd:`$ pip install beautifulsoup4`

(The ``BeautifulSoup`` package is probably `not` what you want. That's
the previous major release, `Beautiful Soup 3`_. Lots of software uses
BS3, so it's still available, but if you're writing new code you
should install ``beautifulsoup4``.)

You can also `download the Beautiful Soup 4 source tarball
<http://www.crummy.com/software/BeautifulSoup/bs4/download/>`_ and
install it with ``setup.py``. The license for Beautiful Soup allows
you to package the entire library with your application, so you can
copy the ``bs4`` directory into your application's codebase.

I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it
should work with other recent versions.

.. _parser-installation:

Be sure to install a good parser!
---------------------------------

Beautiful Soup uses a plugin system that supports a number of popular
Python parsers. If no third-party parsers are installed, Beautiful
Soup uses the HTML parser that comes with Python. In recent releases
of Python (2.7.2 and 3.2.2), this parser works pretty well at handling
bad HTML. In older releases, it's not so good.

Even if you're using a recent release of Python, I recommend you
install the `lxml parser <http://lxml.de/>`_ if possible. It's much
faster than Python's built-in parser. It works with both Python 2 and
Python 3, and it parses HTML and XML very well. Beautiful Soup will
detect that you have lxml installed, and use it instead of Python's
built-in parser.

Depending on your setup, you might install lxml with one of these commands:

:kbd:`$ apt-get install python-lxml`

:kbd:`$ easy_install lxml`

:kbd:`$ pip install lxml`

If you're using Python 2, another alternative is the pure-Python
`html5lib parser <http://code.google.com/p/html5lib/>`_, which parses
HTML the way a web browser does. Depending on your setup, you might
install html5lib with one of these commands:

:kbd:`$ apt-get install python-html5lib`

:kbd:`$ easy_install html5lib`

:kbd:`$ pip install html5lib`

Making the soup
===============

To parse a document, pass it into the ``BeautifulSoup``
constructor. You can pass in a string or an open filehandle::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(open("index.html"))

    soup = BeautifulSoup("<html>data</html>")

First, the document is converted to Unicode, and HTML entities are
converted to Unicode characters::

    BeautifulSoup("Sacr&eacute; bleu!")
    # <html><head></head><body>Sacré bleu!</body></html>

Beautiful Soup then parses the document using the best available
parser.
It will use an HTML parser unless you specifically tell it to -use an XML parser. (See `Choosing a parser`_.) - -Kinds of objects -================ - -Beautiful Soup transforms a complex HTML document into a complex tree -of Python objects. But you'll only ever have to deal with about four -`kinds` of objects. - -.. _Tag: - -``Tag`` -------- - -A ``Tag`` object corresponds to an XML or HTML tag in the original document:: - - soup = BeautifulSoup('Extremely bold') - tag = soup.b - type(tag) - # - -Tags have a lot of attributes and methods, and I'll cover most of them -in `Navigating the tree`_ and `Searching the tree`_. For now, the most -important features of a tag are its name and attributes. - -Name -^^^^ - -Every tag has a name, accessible as ``.name``:: - - tag.name - # u'b' - -If you change a tag's name, the change will be reflected in any HTML -markup generated by Beautiful Soup:: - - tag.name = "blockquote" - tag - #
<blockquote>Extremely bold</blockquote>
- -Attributes -^^^^^^^^^^ - -A tag may have any number of attributes. The tag ```` has an attribute "class" whose value is -"boldest". You can access a tag's attributes by treating the tag like -a dictionary:: - - tag['class'] - # u'boldest' - -You can access that dictionary directly as ``.attrs``:: - - tag.attrs - # {u'class': u'boldest'} - -You can add, remove, and modify a tag's attributes. Again, this is -done by treating the tag as a dictionary:: - - tag['class'] = 'verybold' - tag['id'] = 1 - tag - #
<blockquote class="verybold" id="1">Extremely bold</blockquote>
- - del tag['class'] - del tag['id'] - tag - #
<blockquote>Extremely bold</blockquote>
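Since a tag acts like a dictionary, asking it for an attribute it doesn't have raises ``KeyError``. A minimal sketch: the dictionary-style ``get()`` method works on tags too, and returns ``None`` (or a default you supply) for a missing attribute::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
    tag = soup.b

    # tag['id'] would raise KeyError here: the tag has no "id" attribute.
    print(tag.get('id'))
    # None

    tag.get('class')
    # u'boldest'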
- -``NavigableString`` -------------------- - -A string corresponds to a bit of text within a tag. Beautiful Soup -defines the ``NavigableString`` class to contain these bits of text:: - - tag.string - # u'Extremely bold' - type(tag.string) - # - -A ``NavigableString`` is just like a Python Unicode string, except -that it also supports some of the features described in `Navigating -the tree`_ and `Searching the tree`_. You can convert a -``NavigableString`` to a Unicode string with ``unicode()``:: - - unicode_string = unicode(tag.string) - unicode_string - # u'Extremely bold' - type(unicode_string) - # - -You can't edit a string in place, but you can replace one string with -another, using :ref:`replace_with`:: - - tag.string.replace_with("No longer bold") - tag - #
<blockquote>No longer bold</blockquote>
- -``NavigableString`` supports most of the features described in -`Navigating the tree`_ and `Searching the tree`_, but not all of -them. In particular, since a string can't contain anything (the way a -tag may contain a string or another tag), strings don't support the -``.contents`` or ``.string`` attributes, or the `find()` method. - -``BeautifulSoup`` ------------------ - -The ``BeautifulSoup`` object itself represents the document as a -whole. For most purposes, you can treat it as a :ref:`Tag` -object. This means it supports most of the methods described in -`Navigating the tree`_ and `Searching the tree`_. - -Since the ``BeautifulSoup`` object doesn't correspond to an actual -HTML or XML tag, it has no name and no attributes. But sometimes it's -useful to look at its ``.name``, so it's been given the special -``.name`` "[document]":: - - soup.name - # u'[document]' - -Comments and other special strings ----------------------------------- - -``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost -everything you'll see in an HTML or XML file, but there are a few -leftover bits. The only one you'll probably ever need to worry about -is the comment:: - - markup = "" - soup = BeautifulSoup(markup) - comment = soup.b.string - type(comment) - # - -The ``Comment`` object is just a special type of ``NavigableString``:: - - comment - # u'Hey, buddy. Want to buy a used parser' - -But when it appears as part of an HTML document, a ``Comment`` is -displayed with special formatting:: - - print(soup.b.prettify()) - # - # - # - -Beautiful Soup defines classes for anything else that might show up in -an XML document: ``CData``, ``ProcessingInstruction``, -``Declaration``, and ``Doctype``. Just like ``Comment``, these classes -are subclasses of ``NavigableString`` that add something extra to the -string. Here's an example that replaces the comment with a CDATA -block:: - - from bs4 import CData - cdata = CData("A CDATA block") - comment.replace_with(cdata) - - print(soup.b.prettify()) - # - # - # - - -Navigating the tree -=================== - -Here's the "Three sisters" HTML document again:: - - html_doc = """ - The Dormouse's story - -
    <p class="title"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
- """ - - from bs4 import BeautifulSoup - soup = BeautifulSoup(html_doc) - -I'll use this as an example to show you how to move from one part of -a document to another. - -Going down ----------- - -Tags may contain strings and other tags. These elements are the tag's -`children`. Beautiful Soup provides a lot of different attributes for -navigating and iterating over a tag's children. - -Note that Beautiful Soup strings don't support any of these -attributes, because a string can't have children. - -Navigating using tag names -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The simplest way to navigate the parse tree is to say the name of the -tag you want. If you want the tag, just say ``soup.head``:: - - soup.head - # The Dormouse's story - - soup.title - # The Dormouse's story - -You can do use this trick again and again to zoom in on a certain part -of the parse tree. This code gets the first tag beneath the tag:: - - soup.body.b - # The Dormouse's story - -Using a tag name as an attribute will give you only the `first` tag by that -name:: - - soup.a - # Elsie - -If you need to get `all` the tags, or anything more complicated -than the first tag with a certain name, you'll need to use one of the -methods described in `Searching the tree`_, such as `find_all()`:: - - soup.find_all('a') - # [Elsie, - # Lacie, - # Tillie] - -``.contents`` and ``.children`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A tag's children are available in a list called ``.contents``:: - - head_tag = soup.head - head_tag - # The Dormouse's story - - head_tag.contents - [The Dormouse's story] - - title_tag = head_tag.contents[0] - title_tag - # The Dormouse's story - title_tag.contents - # [u'The Dormouse's story'] - -The ``BeautifulSoup`` object itself has children. In this case, the - tag is the child of the ``BeautifulSoup`` object.:: - - len(soup.contents) - # 1 - soup.contents[0].name - # u'html' - -A string does not have ``.contents``, because it can't contain -anything:: - - text = title_tag.contents[0] - text.contents - # AttributeError: 'NavigableString' object has no attribute 'contents' - -Instead of getting them as a list, you can iterate over a tag's -children using the ``.children`` generator:: - - for child in title_tag.children: - print(child) - # The Dormouse's story - -``.descendants`` -^^^^^^^^^^^^^^^^ - -The ``.contents`` and ``.children`` attributes only consider a tag's -`direct` children. For instance, the tag has a single direct -child--the tag:: - - head_tag.contents - # [<title>The Dormouse's story] - -But the tag itself has a child: the string "The Dormouse's -story". There's a sense in which that string is also a child of the -<head> tag. The ``.descendants`` attribute lets you iterate over `all` -of a tag's children, recursively: its direct children, the children of -its direct children, and so on:: - - for child in head_tag.descendants: - print(child) - # <title>The Dormouse's story - # The Dormouse's story - -The tag has only one child, but it has two descendants: the - tag and the <title> tag's child. The ``BeautifulSoup`` object -only has one direct child (the <html> tag), but it has a whole lot of -descendants:: - - len(list(soup.children)) - # 1 - len(list(soup.descendants)) - # 25 - -.. 
_.string: - -``.string`` -^^^^^^^^^^^ - -If a tag has only one child, and that child is a string, the string is -made available as ``.string``:: - - title_tag.string - # u'The Dormouse's story' - -If a tag's only child is another tag, and `that` tag has a -``.string``, then the parent tag is considered to have the same -``.string`` as its child:: - - head_tag.contents - # [<title>The Dormouse's story] - - head_tag.string - # u'The Dormouse's story' - -If a tag contains more than one thing, then it's not clear what -``.string`` should refer to, so ``.string`` is defined to be -``None``:: - - print(soup.html.string) - # None - -.. _string-generators: - -``.strings`` and ``stripped_strings`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If there's more than one thing inside a tag, you can still look at -just the strings. Use the ``.strings`` generator:: - - for string in soup.strings: - print(repr(string)) - # u"The Dormouse's story" - # u'\n\n' - # u"The Dormouse's story" - # u'\n\n' - # u'Once upon a time there were three little sisters; and their names were\n' - # u'Elsie' - # u',\n' - # u'Lacie' - # u' and\n' - # u'Tillie' - # u';\nand they lived at the bottom of a well.' - # u'\n\n' - # u'...' - # u'\n' - -These strings tend to have a lot of extra whitespace, which you can -remove by using the ``.stripped_strings`` generator instead:: - - for string in soup.stripped_strings: - print(repr(string)) - # u"The Dormouse's story" - # u"The Dormouse's story" - # u'Once upon a time there were three little sisters; and their names were' - # u'Elsie' - # u',' - # u'Lacie' - # u'and' - # u'Tillie' - # u';\nand they lived at the bottom of a well.' - # u'...' - -Here, strings consisting entirely of whitespace are ignored, and -whitespace at the beginning and end of strings is removed. - -Going up --------- - -Continuing the "family tree" analogy, every tag and every string has a -`parent`: the tag that contains it. - -.. _.parent: - -``.parent`` -^^^^^^^^^^^ - -You can access an element's parent with the ``.parent`` attribute. In -the example "three sisters" document, the tag is the parent -of the tag:: - - title_tag = soup.title - title_tag - # <title>The Dormouse's story - title_tag.parent - # The Dormouse's story - -The title string itself has a parent: the tag that contains -it:: - - title_tag.string.parent - # <title>The Dormouse's story - -The parent of a top-level tag like is the ``BeautifulSoup`` object -itself:: - - html_tag = soup.html - type(html_tag.parent) - # - -And the ``.parent`` of a ``BeautifulSoup`` object is defined as None:: - - print(soup.parent) - # None - -.. _.parents: - -``.parents`` -^^^^^^^^^^^^ - -You can iterate over all of an element's parents with -``.parents``. This example uses ``.parents`` to travel from an tag -buried deep within the document, to the very top of the document:: - - link = soup.a - link - # Elsie - for parent in link.parents: - if parent is None: - print(parent) - else: - print(parent.name) - # p - # body - # html - # [document] - # None - -Going sideways --------------- - -Consider a simple document like this:: - - sibling_soup = BeautifulSoup("text1text2") - print(sibling_soup.prettify()) - # - # - # - # - # text1 - # - # - # text2 - # - # - # - # - -The tag and the tag are at the same level: they're both direct -children of the same tag. We call them `siblings`. When a document is -pretty-printed, siblings show up at the same indentation level. You -can also use this relationship in the code you write. 
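As a quick sketch of the idea (using the ``.next_sibling`` attribute introduced just below, and some made-up markup)::

    from bs4 import BeautifulSoup

    list_soup = BeautifulSoup("<ul><li>One</li><li>Two</li><li>Three</li></ul>")
    first_item = list_soup.li

    # Each <li> is a direct child of the same <ul>, so they're siblings.
    first_item.next_sibling
    # <li>Two</li>
    first_item.next_sibling.next_sibling
    # <li>Three</li>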
- -``.next_sibling`` and ``.previous_sibling`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can use ``.next_sibling`` and ``.previous_sibling`` to navigate -between page elements that are on the same level of the parse tree:: - - sibling_soup.b.next_sibling - # text2 - - sibling_soup.c.previous_sibling - # text1 - -The tag has a ``.next_sibling``, but no ``.previous_sibling``, -because there's nothing before the tag `on the same level of the -tree`. For the same reason, the tag has a ``.previous_sibling`` -but no ``.next_sibling``:: - - print(sibling_soup.b.previous_sibling) - # None - print(sibling_soup.c.next_sibling) - # None - -The strings "text1" and "text2" are `not` siblings, because they don't -have the same parent:: - - sibling_soup.b.string - # u'text1' - - print(sibling_soup.b.string.next_sibling) - # None - -In real documents, the ``.next_sibling`` or ``.previous_sibling`` of a -tag will usually be a string containing whitespace. Going back to the -"three sisters" document:: - - Elsie - Lacie - Tillie - -You might think that the ``.next_sibling`` of the first tag would -be the second tag. But actually, it's a string: the comma and -newline that separate the first tag from the second:: - - link = soup.a - link - # Elsie - - link.next_sibling - # u',\n' - -The second tag is actually the ``.next_sibling`` of the comma:: - - link.next_sibling.next_sibling - # Lacie - -.. _sibling-generators: - -``.next_siblings`` and ``.previous_siblings`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can iterate over a tag's siblings with ``.next_siblings`` or -``.previous_siblings``:: - - for sibling in soup.a.next_siblings: - print(repr(sibling)) - # u',\n' - # Lacie - # u' and\n' - # Tillie - # u'; and they lived at the bottom of a well.' - # None - - for sibling in soup.find(id="link3").previous_siblings: - print(repr(sibling)) - # ' and\n' - # Lacie - # u',\n' - # Elsie - # u'Once upon a time there were three little sisters; and their names were\n' - # None - -Going back and forth --------------------- - -Take a look at the beginning of the "three sisters" document:: - - The Dormouse's story -
    <p class="title"><b>The Dormouse's story</b></p>
An HTML parser takes this string of characters and turns it into a
series of events: "open an <html> tag", "open a <head> tag", "open a
<title> tag", "add a string", "close the <title> tag", "open a <p>
tag", and so on. Beautiful Soup offers tools for reconstructing the
initial parse of the document.

.. _element-generators:

``.next_element`` and ``.previous_element``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``.next_element`` attribute of a string or tag points to whatever
was parsed immediately afterwards. It might be the same as
``.next_sibling``, but it's usually drastically different.

Here's the final <a> tag in the "three sisters" document. Its
``.next_sibling`` is a string: the conclusion of the sentence that was
interrupted by the start of the <a> tag::

    last_a_tag = soup.find("a", id="link3")
    last_a_tag
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

    last_a_tag.next_sibling
    # '; and they lived at the bottom of a well.'

But the ``.next_element`` of that <a> tag, the thing that was parsed
immediately after the <a> tag, is `not` the rest of that sentence:
it's the word "Tillie"::

    last_a_tag.next_element
    # u'Tillie'

That's because in the original markup, the word "Tillie" appeared
before that semicolon. The parser encountered an <a> tag, then the
word "Tillie", then the closing </a> tag, then the semicolon and rest of
the sentence. The semicolon is on the same level as the <a> tag, but the
word "Tillie" was encountered first.

The ``.previous_element`` attribute is the exact opposite of
``.next_element``. It points to whatever element was parsed
immediately before this one::

    last_a_tag.previous_element
    # u' and\n'
    last_a_tag.previous_element.next_element
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

``.next_elements`` and ``.previous_elements``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You should get the idea by now. You can use these iterators to move
forward or backward in the document as it was parsed::

    for element in last_a_tag.next_elements:
        print(repr(element))
    # u'Tillie'
    # u';\nand they lived at the bottom of a well.'
    # u'\n\n'
    # <p class="story">...</p>
    # u'...'
    # u'\n'
    # None

Searching the tree
==================

Beautiful Soup defines a lot of methods for searching the parse tree,
but they're all very similar. I'm going to spend a lot of time explaining
the two most popular methods: ``find()`` and ``find_all()``. The other
methods take almost exactly the same arguments, so I'll just cover
them briefly.

Once again, I'll be using the "three sisters" document as an example::

    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <p class="title"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
- """ - - from bs4 import BeautifulSoup - soup = BeautifulSoup(html_doc) - -By passing in a filter to an argument like ``find_all()``, you can -isolate whatever parts of the document you're interested. - -Kinds of filters ----------------- - -Before talking in detail about ``find_all()`` and similar methods, I -want to show examples of different filters you can pass into these -methods. These filters show up again and again, throughout the -search API. You can use them to filter based on a tag's name, -on its attributes, on the text of a string, or on some combination of -these. - -.. _a string: - -A string -^^^^^^^^ - -The simplest filter is a string. Pass a string to a search method and -Beautiful Soup will perform a match against that exact string. This -code finds all the tags in the document:: - - soup.find_all('b') - # [The Dormouse's story] - -.. _a regular expression: - -A regular expression -^^^^^^^^^^^^^^^^^^^^ - -If you pass in a regular expression object, Beautiful Soup will filter -against that regular expression. This code finds all the tags whose -names start with the letter "b"; in this case, the tag and the - tag:: - - import re - for tag in soup.find_all(re.compile("b.*")): - print(tag.name) - # body - # b - -.. _a list: - -A list -^^^^^^ - -If you pass in a list, Beautiful Soup will allow a string match -against `any` item in that list. This code finds all the tags -`and` all the tags:: - - soup.find_all(["a", "b"]) - # [The Dormouse's story, - # Elsie, - # Lacie, - # Tillie] - -.. _the value True: - -``True`` -^^^^^^^^ - -The value ``True`` matches everything it can. This code finds `all` -the tags in the document, but none of the text strings:: - - for tag in soup.find_all(True): - print(tag.name) - # html - # head - # title - # body - # p - # b - # p - # a - # a - # a - # p - -.. a function: - -A function -^^^^^^^^^^ - -If none of the other matches work for you, define a function that -takes an element as its only argument. The function should return -``True`` if the argument matches, and ``False`` otherwise. - -Here's a function that returns ``True`` if a tag defines the "class" -attribute but doesn't define the "id" attribute:: - - def has_class_but_no_id(tag): - return tag.has_key('class') and not tag.has_key('id') - -Pass this function into ``find_all()`` and you'll pick up all the

-tags:: - - soup.find_all(has_class_but_no_id) - # [

The Dormouse's story

, - #

Once upon a time there were...

, - #

...

] - -This function only picks up the

tags. It doesn't pick up the -tags, because those tags define both "class" and "id". It doesn't pick -up tags like and , because those tags don't define -"class". - -Here's a function that returns ``True`` if a tag is surrounded by -string objects:: - - from bs4 import NavigableString - def surrounded_by_strings(tag): - return (isinstance(tag.next_element, NavigableString) - and isinstance(tag.previous_element, NavigableString)) - - for tag in soup.find_all(surrounded_by_strings): - print tag.name - # p - # a - # a - # a - # p - -Now we're ready to look at the search methods in detail. - -``find_all()`` --------------- - -Signature: find_all(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive -<recursive>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`) - -The ``find_all()`` method looks through a tag's descendants and -retrieves `all` descendants that match your filters. I gave several -examples in `Kinds of filters`_, but here are a few more:: - - soup.find_all("title") - # [<title>The Dormouse's story] - - soup.find_all("p", "title") - # [

The Dormouse's story

] - - soup.find_all("a") - # [Elsie, - # Lacie, - # Tillie] - - soup.find_all(id="link2") - # [Lacie] - - import re - soup.find(text=re.compile("sisters")) - # u'Once upon a time there were three little sisters; and their names were\n' - -Some of these should look familiar, but others are new. What does it -mean to pass in a value for ``text``, or ``id``? Why does -``find_all("p", "title")`` find a

tag with the CSS class "title"? -Let's look at the arguments to ``find_all()``. - -.. _name: - -The ``name`` argument -^^^^^^^^^^^^^^^^^^^^^ - -Pass in a value for ``name`` and you'll tell Beautiful Soup to only -consider tags with certain names. Text strings will be ignored, as -will tags whose names that don't match. - -This is the simplest usage:: - - soup.find_all("title") - # [The Dormouse's story] - -Recall from `Kinds of filters`_ that the value to ``name`` can be `a -string`_, `a regular expression`_, `a list`_, `a function`_, or `the value -True`_. - -.. _kwargs: - -The keyword arguments -^^^^^^^^^^^^^^^^^^^^^ - -Any argument that's not recognized will be turned into a filter on tag -attributes. If you pass in a value for an argument called ``id``, -Beautiful Soup will filter against the tag's 'id' attribute:: - - soup.find_all(id='link2') - # [Lacie] - -If you pass in a value for ``href``, Beautiful Soup will filter -against the tag's 'href' attribute:: - - soup.find_all(href=re.compile("elsie")) - # [Elsie] - -You can filter an attribute based on `a string`_, `a regular -expression`_, `a list`_, `a function`_, or `the value True`_. - -This code finds all tags that have an ``id`` attribute, regardless of -what the value is:: - - soup.find_all(id=True) - # [Elsie, - # Lacie, - # Tillie] - -You can filter multiple attributes at once by passing in more than one -keyword argument:: - - soup.find_all(href=re.compile("elsie"), id='link1') - # [three] - -.. _attrs: - -``attrs`` and searching by CSS class -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Instead of using keyword arguments, you can filter tags based on their -attributes passing a dictionary in for ``attrs``. These two lines of -code are equivalent:: - - soup.find_all(href=re.compile("elsie"), id='link1') - soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}) - -The ``attrs`` argument would be a pretty obscure feature were it not for -one thing: CSS. It's very useful to search for a tag that has a -certain CSS class, but the name of the CSS attribute, "class", is also a -Python reserved word. - -You can use ``attrs`` to search by CSS class:: - - soup.find_all("a", { "class" : "sister" }) - # [Elsie, - # Lacie, - # Tillie] - -But that's a lot of code for such a common operation. Instead, you can -pass a string for `attrs` instead of a dictionary. The string will be -used to restrict the CSS class:: - - soup.find_all("a", "sister") - # [Elsie, - # Lacie, - # Tillie] - -.. _text: - -The ``text`` argument -^^^^^^^^^^^^^^^^^^^^^ - -With ``text`` you can search for strings instead of tags. As with -``name`` and the keyword arguments, you can pass in `a string`_, `a -regular expression`_, `a list`_, `a function`_, or `the value True`_. -Here are some examples:: - - soup.find_all(text="Elsie") - # [u'Elsie'] - - soup.find_all(text=["Tillie", "Elsie", "Lacie"]) - # [u'Elsie', u'Lacie', u'Tillie'] - - soup.find_all(text=re.compile("Dormouse")) - [u"The Dormouse's story", u"The Dormouse's story"] - - def is_the_only_string_within_a_tag(s): - """Return True if this string is the only child of its parent tag.""" - return (s == s.parent.string) - - soup.find_all(text=is_the_only_string_within_a_tag) - # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...'] - -.. _limit: - -The ``limit`` argument -^^^^^^^^^^^^^^^^^^^^^^ - -``find_all()`` returns all the tags and strings that match your -filters. This can take a while if the document is large. 
If you don't -need `all` the results, you can pass in a number for ``limit``. This -works just like the LIMIT keyword in SQL. It tells Beautiful Soup to -stop gathering results after it's found a certain number. - -There are three links in the "three sisters" document, but this code -only finds the first two:: - - soup.find_all("a", limit=2) - # [Elsie, - # Lacie] - -.. _recursive: - -The ``recursive`` argument -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If you call ``mytag.find_all()``, Beautiful Soup will examine all the -descendants of ``mytag``: its children, its children's children, and -so on. If you only want Beautiful Soup to consider direct children, -you can pass in ``recursive=False``. See the difference here:: - - soup.html.find_all("title") - # [The Dormouse's story] - - soup.html.find_all("title", recursive=False) - # [] - -Here's that part of the document:: - - - - - The Dormouse's story - - - ... - -The tag is beneath the <html> tag, but it's not `directly` -beneath the <html> tag: the <head> tag is in the way. Beautiful Soup -finds the <title> tag when it's allowed to look at all descendants of -the <html> tag, but when ``recursive=False`` restricts it to the -<html> tag's immediate children, it finds nothing. - -Beautiful Soup offers a lot of tree-searching methods (covered below), -and they mostly take the same arguments as ``find_all()``: ``name``, -``attrs``, ``text``, ``limit``, and the keyword arguments. But the -``recursive`` argument is different: ``find_all()`` and ``find()`` are -the only methods that support it. Passing ``recursive=False`` into a -method like ``find_parents()`` wouldn't be very useful. - -Calling a tag is like calling ``find_all()`` --------------------------------------------- - -Because ``find_all()`` is the most popular method in the Beautiful -Soup search API, you can use a shortcut for it. If you treat the -``BeautifulSoup`` object or a ``Tag`` object as though it were a -function, then it's the same as calling ``find_all()`` on that -object. These two lines of code are equivalent:: - - soup.find_all("a") - soup("a") - -These two lines are also equivalent:: - - soup.title.find_all(text=True) - soup.title(text=True) - -``find()`` ----------- - -Signature: find(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive -<recursive>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`) - -The ``find_all()`` method scans the entire document looking for -results, but sometimes you only want to find one result. If you know a -document only has one <body> tag, it's a waste of time to scan the -entire document looking for more. Rather than passing in ``limit=1`` -every time you call ``find_all``, you can use the ``find()`` -method. These two lines of code are `nearly` equivalent:: - - soup.find_all('title', limit=1) - # [<title>The Dormouse's story] - - soup.find('title') - # The Dormouse's story - -The only difference is that ``find_all()`` returns a list containing -the single result, and ``find()`` just returns the result. - -If ``find_all()`` can't find anything, it returns an empty list. If -``find()`` can't find anything, it returns ``None``:: - - print(soup.find("nosuchtag")) - # None - -Remember the ``soup.head.title`` trick from `Navigating using tag -names`_? 
That trick works by repeatedly calling ``find()``:: - - soup.head.title - # The Dormouse's story - - soup.find("head").find("title") - # The Dormouse's story - -``find_parents()`` and ``find_parent()`` ----------------------------------------- - -Signature: find_parents(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) - -Signature: find_parent(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) - -I spent a lot of time above covering ``find_all()`` and -``find()``. The Beautiful Soup API defines ten other methods for -searching the tree, but don't be afraid. Five of these methods are -basically the same as ``find_all()``, and the other five are basically -the same as ``find()``. The only differences are in what parts of the -tree they search. - -First let's consider ``find_parents()`` and -``find_parent()``. Remember that ``find_all()`` and ``find()`` work -their way down the tree, looking at tag's descendants. These methods -do the opposite: they work their way `up` the tree, looking at a tag's -(or a string's) parents. Let's try them out, starting from a string -buried deep in the "three daughters" document:: - - a_string = soup.find(text="Lacie") - a_string - # u'Lacie' - - a_string.find_parents("a") - # [Lacie] - - a_string.find_parent("p") - #
<p class="story">Once upon a time there were three little sisters; and their names were
    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    # and they lived at the bottom of a well.</p>
    a_string.find_parents("p", "title")
    # []

One of the three <a> tags is the direct parent of the string in
question, so our search finds it. One of the three <p> tags is an
indirect parent of the string, and our search finds that as
well. There's a <p>
tag with the CSS class "title" `somewhere` in the -document, but it's not one of this string's parents, so we can't find -it with ``find_parents()``. - -You may have made the connection between ``find_parent()`` and -``find_parents()``, and the `.parent`_ and `.parents`_ attributes -mentioned earlier. The connection is very strong. These search methods -actually use ``.parents`` to iterate over all the parents, and check -each one against the provided filter to see if it matches. - -``find_next_siblings()`` and ``find_next_sibling()`` ----------------------------------------------------- - -Signature: find_next_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) - -Signature: find_next_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) - -These methods use :ref:`.next_siblings ` to -iterate over the rest of an element's siblings in the tree. The -``find_next_siblings()`` method returns all the siblings that match, -and ``find_next_sibling()`` only returns the first one:: - - first_link = soup.a - first_link - # Elsie - - first_link.find_next_siblings("a") - # [Lacie, - # Tillie] - - first_story_paragraph = soup.find("p", "story") - first_story_paragraph.find_next_sibling("p") - #
<p class="story">...</p>
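Note how this differs from the plain ``.next_sibling`` attribute: the first <a> tag's ``.next_sibling`` is the comma-and-newline string, but ``find_next_sibling()`` skips ahead until something matches the filter. A quick sketch, reusing the "three sisters" soup::

    first_link = soup.a

    first_link.next_sibling
    # u',\n'

    # Keep going along the same level until an <a> tag turns up.
    first_link.find_next_sibling("a")
    # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>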
- -``find_previous_siblings()`` and ``find_previous_sibling()`` ------------------------------------------------------------- - -Signature: find_previous_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) - -Signature: find_previous_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) - -These methods use :ref:`.previous_siblings ` to iterate over an element's -siblings that precede it in the tree. The ``find_previous_siblings()`` -method returns all the siblings that match, and -``find_previous_sibling()`` only returns the first one:: - - last_link = soup.find("a", id="link3") - last_link - # Tillie - - last_link.find_previous_siblings("a") - # [Lacie, - # Elsie] - - first_story_paragraph = soup.find("p", "story") - first_story_paragraph.find_previous_sibling("p") - #
<p class="title"><b>The Dormouse's story</b></p>
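These sibling methods accept the same filters as ``find_all()``, so keyword arguments work here too; a small sketch::

    last_link = soup.find("a", id="link3")

    # Combine a tag name with an attribute filter.
    last_link.find_previous_sibling("a", id="link1")
    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>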
- - -``find_all_next()`` and ``find_next()`` ---------------------------------------- - -Signature: find_all_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) - -Signature: find_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) - -These methods use :ref:`.next_elements ` to -iterate over whatever tags and strings that come after it in the -document. The ``find_all_next()`` method returns all matches, and -``find_next()`` only returns the first match:: - - first_link = soup.a - first_link - # Elsie - - first_link.find_all_next(text=True) - # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', - # u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n'] - - first_link.find_next("p") - #
<p class="story">...</p>
In the first example, the string "Elsie" showed up, even though it was
contained within the <a> tag we started from. In the second example, the
last <p>
tag in the document showed up, even though it's not in -the same part of the tree as the tag we started from. For these -methods, all that matters is that an element match the filter, and -show up later in the document than the starting element. - -``find_all_previous()`` and ``find_previous()`` ------------------------------------------------ - -Signature: find_all_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) - -Signature: find_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) - -These methods use :ref:`.previous_elements ` to -iterate over the tags and strings that came before it in the -document. The ``find_all_previous()`` method returns all matches, and -``find_previous()`` only returns the first match:: - - first_link = soup.a - first_link - # Elsie - - first_link.find_all_previous("p") - # [
<p class="story">Once upon a time there were three little sisters; ...</p>,
    #  <p class="title"><b>The Dormouse's story</b></p>]
    first_link.find_previous("title")
    # <title>The Dormouse's story</title>

The call to ``find_all_previous("p")`` found the first paragraph in
the document (the one with class="title"), but it also found the
second paragraph, the <p> tag that contains the <a> tag we started
with. This shouldn't be too surprising: we're looking at all the tags
that show up earlier in the document than the one we started with. A
<p> tag that contains an <a> tag must have shown up earlier in the
document.

Modifying the tree
==================

Beautiful Soup's main strength is in searching the parse tree, but you
can also modify the tree and write your changes as a new HTML or XML
document.

Changing tag names and attributes
---------------------------------

I covered this earlier, in `Attributes`_, but it bears repeating. You
can rename a tag, change the values of its attributes, add new
attributes, and delete attributes::

    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
    tag = soup.b

    tag.name = "blockquote"
    tag['class'] = 'verybold'
    tag['id'] = 1
    tag
    # <blockquote class="verybold" id="1">Extremely bold</blockquote>

    del tag['class']
    del tag['id']
    tag
    # <blockquote>Extremely bold</blockquote>
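The same operations compose naturally; this sketch (on made-up markup, and assuming Beautiful Soup's usual alphabetical ordering of attributes on output) turns a link into a plain ``<span>``::

    soup = BeautifulSoup('<a href="http://example.com/" id="old">Example</a>')
    tag = soup.a

    tag.name = "span"          # rename the tag
    tag['class'] = 'external'  # add a new attribute
    del tag['href']            # drop one that no longer applies
    tag
    # <span class="external" id="old">Example</span>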
- - -Modifying ``.string`` ---------------------- - -If you set a tag's ``.string`` attribute, the tag's contents are -replaced with the string you give:: - - markup = '
I linked to example.com' - soup = BeautifulSoup(markup) - - tag = soup.a - tag.string = "New link text." - tag - # New link text. - -Be careful: if the tag contained other tags, they and all their -contents will be destroyed. - -``append()`` ------------- - -You can add to a tag's contents with ``Tag.append()``. It works just -like calling ``.append()`` on a Python list:: - - soup = BeautifulSoup("Foo") - soup.a.append("Bar") - - soup - # FooBar - soup.a.contents - # [u'Foo', u'Bar'] - -``BeautifulSoup.new_string()`` and ``.new_tag()`` ------------------------------------------------- - -If you need to add a string to a document, no problem--you can pass a -Python string in to ``append()``, or you can call the factory method -``BeautifulSoup.new_string()``:: - - soup = BeautifulSoup("") - tag = soup.b - tag.append("Hello") - new_string = soup.new_string(" there") - tag.append(new_string) - tag - # Hello there. - tag.contents - # [u'Hello', u' there'] - -What if you need to create a whole new tag? The best solution is to -call the factory method ``BeautifulSoup.new_tag()``:: - - soup = BeautifulSoup("") - original_tag = soup.b - - new_tag = soup.new_tag("a", href="http://www.example.com") - original_tag.append(new_tag) - original_tag - # - - new_tag.string = "Link text." - original_tag - # Link text. - -Only the first argument, the tag name, is required. - -``insert()`` ------------- - -``Tag.insert()`` is just like ``Tag.append()``, except the new element -doesn't necessarily go at the end of its parent's -``... contents``. It'll be inserted at whatever numeric position you -say. It works just like ``.insert()`` on a Python list:: - - markup = 'I linked to example.com' - soup = BeautifulSoup(markup) - tag = soup.a - - tag.insert(1, "but did not endorse ") - tag - # I linked to but did not endorse example.com - tag.contents - # [u'I linked to ', u'but did not endorse', example.com] - -``move_before()`` and ``move_after()`` ------------------------------------------- - -The ``move_before()`` method moves a tag or string so that it -immediately precedes something else in the parse tree:: - - soup = BeautifulSoup("stop") - tag = soup.new_tag("i") - tag.string = "Don't" - tag.move_before(soup.b.string) - soup.b - # Don'tstop - -The ``move_after()`` method moves a tag or string so that it -immediately follows something else in the parse tree:: - - soup.new_string(" ever ").move_after(soup.b.i) - soup.b - # Don't ever stop - soup.b.contents - # [Don't, u' ever ', u'stop'] - -``clear()`` ------------ - -``Tag.clear()`` removes the contents of a tag:: - - markup = 'I linked to example.com' - soup = BeautifulSoup(markup) - tag = soup.a - - tag.clear() - tag - # - -``extract()`` -------------- - -``PageElement.extract()`` removes a tag or string from the tree. It -returns the tag or string that was extracted:: - - markup = 'I linked to example.com' - soup = BeautifulSoup(markup) - a_tag = soup.a - - i_tag = soup.i.extract() - - a_tag - # I linked to - - i_tag - # example.com - - print(i_tag.parent) - None - -At this point you effectively have two parse trees: one rooted at the -``BeautifulSoup`` object you used to parse the document, and one rooted -at the tag that was extracted. 
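For example, rebuilding the same markup used throughout this section, the two trees no longer know about each other: searching upward from the extracted tag comes up empty::

    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    i_tag = soup.i.extract()

    soup.a
    # <a href="http://example.com/">I linked to </a>

    # The extracted tag has no parents in the original tree.
    print(i_tag.find_parent("a"))
    # None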
You can go on to call ``extract`` on -a child of the element you extracted:: - - my_string = i_tag.string.extract() - my_string - # u'example.com' - - print(my_string.parent) - # None - i_tag - # - - -``decompose()`` ---------------- - -``Tag.decompose()`` removes a tag from the tree, then `completely -destroys it and its contents`:: - - markup = 'I linked to example.com' - soup = BeautifulSoup(markup) - a_tag = soup.a - - soup.i.decompose() - - a_tag - # I linked to - - -.. _replace_with: - -``replace_with()`` ------------------- - -``PageElement.replace_with()`` removes a tag or string from the tree, -and replaces it with the tag or string of your choice:: - - markup = 'I linked to example.com' - soup = BeautifulSoup(markup) - a_tag = soup.a - - new_tag = soup.new_tag("b") - new_tag.string = "example.net" - a_tag.i.replace_with(new_tag) - - a_tag - # I linked to example.net - -``replace_with()`` returns the tag or string that was replaced, so -that you can examine it or add it back to another part of the tree. - -``replace_with_children()`` ---------------------------- - -``Tag.replace_with_children()`` replaces a tag with whatever's inside -that tag. It's good for stripping out markup:: - - markup = 'I linked to example.com' - soup = BeautifulSoup(markup) - a_tag = soup.a - - a_tag.i.replace_with_children() - a_tag - # I linked to example.com - -Like ``replace_with()``, ``replace_with_children()`` returns the tag -that was replaced. - -Output -====== - -Pretty-printing ---------------- - -The ``prettify()`` method will turn a Beautiful Soup parse tree into a -nicely formatted bytestring, with each HTML/XML tag on its own line:: - - markup = 'I linked to example.com' - soup = BeautifulSoup(markup) - soup.prettify() - # '\n \n \n \n \n...' - - print(soup.prettify()) - # - # - # - # - # - # I linked to - # - # example.com - # - # - # - # - -You can call ``prettify()`` on the top-level ``BeautifulSoup`` object, -or on any of its ``Tag`` objects:: - - print(soup.a.prettify()) - # - # I linked to - # - # example.com - # - # - -Non-pretty printing -------------------- - -If you just want a string, with no fancy formatting, you can call -``unicode()`` or ``str()`` on a ``BeautifulSoup`` object, or a ``Tag`` -within it:: - - str(soup) - # 'I linked to example.com' - - unicode(soup.a) - # u'I linked to example.com' - -The ``str()`` function returns a string encoded in UTF-8. See -`Encodings`_ for other options. - -You can also call ``encode()`` to get a bytestring, and ``decode()`` -to get Unicode. - -Output formatters ------------------ - -If you give Beautiful Soup a document that contains HTML entities like -"&lquot;", they'll be converted to Unicode characters:: - - soup = BeautifulSoup("“Hello,” he said.") - unicode(soup) - # u'\u201cHello,\u201d he said.' - -If you then convert the document to a string, the Unicode characters -will be encoded as UTF-8. You won't get the HTML entities back: - - str(soup) - # '\xe2\x80\x9cHello,\xe2\x80\x9d he said.' - -By default, the only characters that are escaped upon output are bare -ampersands and angle brackets. These get turned into "&", "<", -and ">", so that Beautiful Soup doesn't inadvertently generate -invalid HTML or XML:: - - soup = BeautifulSoup("
<p>The law firm of Dewey, Cheatem, & Howe</p>")
    soup.p
    # <p>The law firm of Dewey, Cheatem, &amp; Howe</p>
- -You can change this behavior by providing a value for the -``formatter`` argument to ``prettify()``, ``encode()``, or -``decode()``. Beautiful Soup recognizes four possible values for -``formatter`` - -The default is ``formatter="minimal"``. Strings will only be processed -enough to ensure that Beautiful Soup generates valid HTML/XML:: - - french = "
Output formatters
-----------------

If you give Beautiful Soup a document that contains HTML entities like
"&ldquo;", they'll be converted to Unicode characters::

 soup = BeautifulSoup("&ldquo;Hello,&rdquo; he said.")
 unicode(soup)
 # u'\u201cHello,\u201d he said.'

If you then convert the document to a string, the Unicode characters
will be encoded as UTF-8. You won't get the HTML entities back::

 str(soup)
 # '\xe2\x80\x9cHello,\xe2\x80\x9d he said.'

By default, the only characters that are escaped upon output are bare
ampersands and angle brackets. These get turned into "&amp;", "&lt;",
and "&gt;", so that Beautiful Soup doesn't inadvertently generate
invalid HTML or XML::

 soup = BeautifulSoup("<p>The law firm of Dewey, Cheatem, & Howe</p>")
 soup.p
 # <p>The law firm of Dewey, Cheatem, &amp; Howe</p>

You can change this behavior by providing a value for the
``formatter`` argument to ``prettify()``, ``encode()``, or
``decode()``. Beautiful Soup recognizes four possible values for
``formatter``.

The default is ``formatter="minimal"``. Strings will only be processed
enough to ensure that Beautiful Soup generates valid HTML/XML::

 french = "<p>Il a dit <<Sacré bleu!>></p>"
 soup = BeautifulSoup(french)
 print(soup.prettify(formatter="minimal"))
 # <html>
 #  <body>
 #   <p>
 #    Il a dit &lt;&lt;Sacré bleu!&gt;&gt;
 #   </p>
 #  </body>
 # </html>

``formatter="html"`` will convert Unicode characters to HTML entities
whenever possible::

 print(soup.prettify(formatter="html"))
 # <html>
 #  <body>
 #   <p>
 #    Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
 #   </p>
 #  </body>
 # </html>

If you pass in ``formatter=None``, Beautiful Soup will not modify
strings at all on output. This is the fastest option, but it may lead
to Beautiful Soup generating invalid HTML/XML, as in this example::

 print(soup.prettify(formatter=None))
 # <html>
 #  <body>
 #   <p>
 #    Il a dit <<Sacré bleu!>>
 #   </p>
 #  </body>
 # </html>

Finally, if you pass in a function for ``formatter``, Beautiful Soup
will call that function once for every string in the document. You can
do whatever you want in this function. Here's a formatter that
converts strings to uppercase and does absolutely nothing else::

 def uppercase(str):
     return str.upper()

 print(soup.prettify(formatter=uppercase))
 # <html>
 #  <body>
 #   <p>
 #    IL A DIT <<SACRÉ BLEU!>>
 #   </p>
 #  </body>
 # </html>

If you're writing your own function, you should know about the
``EntitySubstitution`` class in the ``bs4.dammit`` module. This class
implements Beautiful Soup's standard formatters as class methods: the
"html" formatter is ``EntitySubstitution.substitute_html``, and the
"minimal" formatter is ``EntitySubstitution.substitute_xml``. You can
use these functions to simulate ``formatter="html"`` or
``formatter="minimal"``, and then do something in addition.

Here's an example that converts strings to uppercase *and* replaces
Unicode characters with HTML entities whenever possible::

 from bs4.dammit import EntitySubstitution
 def uppercase_and_substitute_html_entities(str):
     return EntitySubstitution.substitute_html(str.upper())

 print(soup.prettify(formatter=uppercase_and_substitute_html_entities))
 # <html>
 #  <body>
 #   <p>
 #    IL A DIT &lt;&lt;SACR&Eacute; BLEU!&gt;&gt;
 #   </p>
 #  </body>
 # </html>

``get_text()``
--------------

If you only want the text part of a document or tag, you can use the
``get_text()`` method. It returns all the text in a document or
beneath a tag, as a single Unicode string::

 markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
 soup = BeautifulSoup(markup)

 soup.get_text()
 u'\nI linked to example.com\n'
 soup.i.get_text()
 u'example.com'

You can specify a string to be used to join the bits of text
together::

 soup.get_text("|")
 u'\nI linked to |example.com|\n'

You can tell Beautiful Soup to strip whitespace from the beginning and
end of each bit of text::

 soup.get_text("|", strip=True)
 u'I linked to|example.com'

But at that point you might want to use the :ref:`.stripped_strings
<string-generators>` generator instead, and process the text yourself::

 [text for text in soup.stripped_strings]
 # [u'I linked to', u'example.com']

Choosing a parser
=================

If you just need to parse some HTML, you can dump the markup into the
``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful
Soup will pick a parser for you and parse the data. But there are a
few additional arguments you can pass in to the constructor to change
which parser is used.

The first argument to the ``BeautifulSoup`` constructor is a string or
an open filehandle--the markup you want parsed. The second argument is
`how` you'd like the markup parsed.

If you don't specify anything, you'll get the best HTML parser that's
installed. Beautiful Soup ranks lxml's parser as being the best, then
html5lib's, then Python's built-in parser. You can override this by
specifying one of the following:

* What type of markup you want to parse. Currently supported are
  "html", "xml", and "html5".

* The name of the parser library you want to use. Currently supported
  options are "lxml", "html5lib", and "html.parser" (Python's
  built-in HTML parser).

Some examples::

 BeautifulSoup(markup, "lxml")
 BeautifulSoup(markup, "xml")
 BeautifulSoup(markup, "html5")

You can specify a list of the parser features you want, instead of
just one. Right now this is mostly useful for distinguishing between
lxml's HTML parser and its XML parser::

 BeautifulSoup(markup, ["html", "lxml"])
 BeautifulSoup(markup, ["xml", "lxml"])

If you don't have an appropriate parser installed, Beautiful Soup will
ignore your request and pick a different parser. For instance, right
now the only supported XML parser is lxml, so if you don't have lxml
installed, asking for an XML parser won't give you one, and asking for
"lxml" won't work either.

Why would you use one parser over another? Because different parsers
will create different parse trees from the same document. The biggest
differences are between HTML parsers and XML parsers. Here's a short
document, parsed as HTML::

 BeautifulSoup("<a><b /></a>")
 # <html><body><a><b></b></a></body></html>

Since an empty <b /> tag is not valid HTML, the parser turns it into a
<b></b> tag pair.

Here's the same document parsed as XML (running this requires that you
have lxml installed). Note that the empty <b /> tag is left alone, and
that the document is given an XML declaration instead of being put
into an <html> tag::

 BeautifulSoup("<a><b /></a>", "xml")
 # <?xml version="1.0" encoding="utf-8"?>
 # <a><b/></a>

There are also differences between HTML parsers. If you give Beautiful
Soup a perfectly-formed HTML document, these differences won't
matter. One parser may be faster than another, but they'll all give
you a data structure that looks exactly like the original HTML
document.
But if the document is not perfectly-formed, different parsers will
give different results. Here's a short, invalid document parsed using
lxml's HTML parser. Note that the dangling </p> tag is simply
ignored::

 BeautifulSoup("<a></p>", "lxml")
 # <html><body><a></a></body></html>

Here's the same document parsed using html5lib::

 BeautifulSoup("<a></p>", "html5lib")
 # <html><head></head><body><a><p></p></a></body></html>

Instead of ignoring the dangling </p> tag, html5lib pairs it with an
opening <p> tag. This parser also adds an empty <head> tag to the
document.

Here's the same document parsed with Python's built-in HTML
parser::

 BeautifulSoup("<a></p>", "html.parser")
 # <a></a>

Like html5lib, this parser ignores the closing </p> tag. Unlike
html5lib, this parser makes no attempt to create a well-formed HTML
document by adding a <body> tag. Unlike lxml, it doesn't even bother
to add an <html> tag.

Since the document "<a></p>" is invalid, none of these techniques is
the "correct" way to handle it. The html5lib parser uses techniques
that are part of the HTML5 standard, so it has the best claim on being
the "correct" way, but all three techniques are legitimate.

Differences between parsers can affect your script. If you're planning
on distributing your script to other people, you might want to specify
in the ``BeautifulSoup`` constructor which parser you used during
development. That will reduce the chances that your users parse a
document differently from the way you parse it.
Encodings
=========

Any HTML or XML document is written in a specific encoding like ASCII
or UTF-8. But when you load that document into Beautiful Soup, you'll
discover it's been converted to Unicode::

 markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
 soup = BeautifulSoup(markup)
 soup.h1
 # <h1>Sacré bleu!</h1>
 soup.h1.string
 # u'Sacr\xe9 bleu!'

It's not magic. (That sure would be nice.) Beautiful Soup uses a
sub-library called `Unicode, Dammit`_ to detect a document's encoding
and convert it to Unicode. The autodetected encoding is available as
the ``.original_encoding`` attribute of the ``BeautifulSoup`` object::

 soup.original_encoding
 'utf-8'

Unicode, Dammit guesses correctly most of the time, but sometimes it
makes mistakes. Sometimes it guesses correctly, but only after a
byte-by-byte search of the document that takes a very long time. If
you happen to know a document's encoding ahead of time, you can avoid
mistakes and delays by passing it to the ``BeautifulSoup`` constructor
as ``from_encoding``.

Here's a document written in ISO-8859-8. The document is so short that
Unicode, Dammit can't get a good lock on it, and misidentifies it as
ISO-8859-7::

 markup = b"<h1>\xed\xe5\xec\xf9</h1>"
 soup = BeautifulSoup(markup)
 soup.h1
 <h1>νεμω</h1>
 soup.original_encoding
 'ISO-8859-7'

We can fix this by passing in the correct ``from_encoding``::

 soup = BeautifulSoup(markup, from_encoding="iso-8859-8")
 soup.h1
 <h1>םולש</h1>
 soup.original_encoding
 'iso8859-8'
Output encoding
---------------

When you write out a document from Beautiful Soup, you get a UTF-8
document, even if the document wasn't in UTF-8 to begin with. Here's a
document written in the Latin-1 encoding::

 markup = b'''
  <html>
   <head>
    <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
   </head>
   <body>
    <p>Sacr\xe9 bleu!</p>
   </body>
  </html>
 '''

 soup = BeautifulSoup(markup)
 print(soup.prettify())
 # <html>
 #  <head>
 #   <meta content="text/html; charset=utf-8" http-equiv="Content-type" />
 #  </head>
 #  <body>
 #   <p>
 #    Sacré bleu!
 #   </p>
 #  </body>
 # </html>

Note that the <meta> tag has been rewritten to reflect the fact that
the document is now in UTF-8.

If you don't want UTF-8, you can pass an encoding into ``prettify()``::

 print(soup.prettify("latin-1"))
 # <html>
 #  <head>
 #   <meta content="text/html; charset=latin-1" http-equiv="Content-type" />
 # ...

You can also call ``encode()`` on the ``BeautifulSoup`` object, or any
element in the soup, just as if it were a Python string::

 soup.p.encode("latin-1")
 # '<p>Sacr\xe9 bleu!</p>'

 soup.p.encode("utf-8")
 # '<p>Sacr\xc3\xa9 bleu!</p>'
Unicode, Dammit
---------------

You can use Unicode, Dammit without using Beautiful Soup. It's useful
whenever you have data in an unknown encoding and you just want it to
become Unicode::

 from bs4 import UnicodeDammit
 dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
 print(dammit.unicode_markup)
 # Sacré bleu!
 dammit.original_encoding
 # 'utf-8'

The more data you give Unicode, Dammit, the more accurately it will
guess. If you have your own suspicions as to what the encoding might
be, you can pass them in as a list::

 dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
 print(dammit.unicode_markup)
 # Sacré bleu!
 dammit.original_encoding
 # 'latin-1'

Unicode, Dammit has one special feature that Beautiful Soup doesn't
use. You can use it to convert Microsoft smart quotes to HTML or XML
entities::

 markup = b"<p>I just \x93love\x94 Microsoft Word</p>"

 UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup
 # u'<p>I just &ldquo;love&rdquo; Microsoft Word</p>'

 UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup
 # u'<p>I just &#x201C;love&#x201D; Microsoft Word</p>'

You might find this feature useful, but Beautiful Soup doesn't use
it. Beautiful Soup prefers the default behavior, which is to convert
Microsoft smart quotes to Unicode characters along with everything
else::

 UnicodeDammit(markup, ["windows-1252"]).unicode_markup
 # u'<p>I just \u201clove\u201d Microsoft Word</p>'
Parsing only part of a document
===============================

Let's say you want to use Beautiful Soup to look at a document's <a>
tags. It's a waste of time and memory to parse the entire document and
then go over it again looking for <a> tags. It would be much faster to
ignore everything that wasn't an <a> tag in the first place. The
``SoupStrainer`` class allows you to choose which parts of an incoming
document are parsed. You just create a ``SoupStrainer`` and pass it in
to the ``BeautifulSoup`` constructor as the ``parse_only`` argument.

(Note that *this feature won't work if you're using the html5lib
parser*. If you use html5lib, the whole document will be parsed, no
matter what. In the examples below, I'll be forcing Beautiful Soup to
use Python's built-in parser.)

``SoupStrainer``
----------------

The ``SoupStrainer`` class takes the same arguments as a typical
method from `Searching the tree`_: :ref:`name <name>`, :ref:`attrs
<attrs>`, :ref:`text <text>`, and :ref:`**kwargs <kwargs>`. Here are
three ``SoupStrainer`` objects::

 from bs4 import SoupStrainer

 only_a_tags = SoupStrainer("a")

 only_tags_with_id_link2 = SoupStrainer(id="link2")

 def is_short_string(string):
     return len(string) < 10

 only_short_strings = SoupStrainer(text=is_short_string)

I'm going to bring back the "three sisters" document one more time,
and we'll see what the document looks like when it's parsed with these
three ``SoupStrainer`` objects::

 html_doc = """
 <html><head><title>The Dormouse's story</title></head>

 <p class="title"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sisters; and their names were
 <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>

 <p class="story">...</p>
 """

 print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
 # <a class="sister" href="http://example.com/elsie" id="link1">
 #  Elsie
 # </a>
 # <a class="sister" href="http://example.com/lacie" id="link2">
 #  Lacie
 # </a>
 # <a class="sister" href="http://example.com/tillie" id="link3">
 #  Tillie
 # </a>

 print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
 # <a class="sister" href="http://example.com/lacie" id="link2">
 #  Lacie
 # </a>

 print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
 # Elsie
 # ,
 # Lacie
 # and
 # Tillie
 # ...
 #

You can also pass a ``SoupStrainer`` into any of the methods covered
in `Searching the tree`_. This probably isn't terribly useful, but I
thought I'd mention it::

 soup = BeautifulSoup(html_doc)
 soup.find_all(only_short_strings)
 # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
 #  u'\n\n', u'...', u'\n']

Troubleshooting
===============

Parsing XML
-----------

By default, Beautiful Soup parses documents as HTML. To parse a
document as XML, pass in "xml" as the second argument to the
``BeautifulSoup`` constructor::

 soup = BeautifulSoup(markup, "xml")

You'll need to :ref:`have lxml installed <parser-installation>`.

Improving Performance
---------------------

Beautiful Soup will never be as fast as the parsers it sits on top
of. If response time is critical, if you're paying for computer time
by the hour, or if there's any other reason why computer time is more
valuable than programmer time, you should forget about Beautiful Soup
and work directly atop `lxml <http://lxml.de/>`_.

That said, there are things you can do to speed up Beautiful Soup. If
you're not using lxml as the underlying parser, my advice is to
:ref:`start <parser-installation>`. Beautiful Soup parses documents
significantly faster using lxml than using html.parser or html5lib.

Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by
doing a byte-by-byte examination of the file. This slows Beautiful
Soup to a crawl. My tests indicate that this only happened on 2.x
versions of Python, and that it happened most often with documents
using Russian or Chinese encodings. If this is happening to you, you
can fix it by using Python 3 for your script. Or, if you happen to
know a document's encoding, you can pass it into the
``BeautifulSoup`` constructor as ``from_encoding``.

`Parsing only part of a document`_ won't save you much time parsing
the document, but it can save a lot of memory, and it'll make
`searching` the document much faster.
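Putting that advice together, here's a rough sketch (my own
illustration, not from the original text; it assumes ``markup`` is a
UTF-8 bytestring you've already loaded)::

 from bs4 import BeautifulSoup, SoupStrainer

 only_a_tags = SoupStrainer("a")

 # lxml for speed, parse_only to save memory, and from_encoding to
 # skip the slow encoding search.
 soup = BeautifulSoup(markup, "lxml", parse_only=only_a_tags,
                      from_encoding="utf-8")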
Beautiful Soup 3
================

Beautiful Soup 3.2.0 is the old version, the last release of the
Beautiful Soup 3 series. It's currently the version packaged with all
major Linux distributions:

:kbd:`$ apt-get install python-beautifulsoup`

It's also published through PyPI as ``BeautifulSoup``:

:kbd:`$ easy_install BeautifulSoup`

:kbd:`$ pip install BeautifulSoup`

You can also `download a tarball of Beautiful Soup 3.2.0
<http://www.crummy.com/software/BeautifulSoup/download/3.x/BeautifulSoup-3.2.0.tar.gz>`_.

If you ran ``easy_install beautifulsoup`` or ``easy_install
BeautifulSoup``, but your code doesn't work, you installed Beautiful
Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``.

`The documentation for Beautiful Soup 3 is archived online
<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_. If
your first language is Chinese, it might be easier for you to read
`the Chinese translation of the Beautiful Soup 3 documentation
<http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html>`_,
then read this document to find out about the changes made in
Beautiful Soup 4.

Porting code to BS4
-------------------

Most code written against Beautiful Soup 3 will work against Beautiful
Soup 4 with one simple change.
All you should have to do is change the
package name from ``BeautifulSoup`` to ``bs4``. So this::

 from BeautifulSoup import BeautifulSoup

becomes this::

 from bs4 import BeautifulSoup

* If you get the ``ImportError`` "No module named BeautifulSoup", your
  problem is that you're trying to run Beautiful Soup 3 code, but you
  only have Beautiful Soup 4 installed.

* If you get the ``ImportError`` "No module named bs4", your problem
  is that you're trying to run Beautiful Soup 4 code, but you only
  have Beautiful Soup 3 installed.

Although BS4 is mostly backwards-compatible with BS3, most of its
methods have been deprecated and given new names for `PEP 8 compliance
<http://www.python.org/dev/peps/pep-0008/>`_. There are numerous other
renames and changes, and a few of them break backwards compatibility.

Here's what you'll need to know to convert your BS3 code and habits to BS4:

You need a parser
^^^^^^^^^^^^^^^^^

Beautiful Soup 3 used Python's ``SGMLParser``, a module that was
deprecated and removed in Python 3.0. Beautiful Soup 4 uses
``html.parser`` by default, but you can plug in lxml or html5lib and
use that instead. Until ``html.parser`` is improved to handle
real-world HTML better, that's what I recommend you do. See `Be sure
to install a good parser!`_.

Method names
^^^^^^^^^^^^

The old camelCase names have PEP 8-compliant replacements, as the
sketch after this list shows:

* ``replaceWith`` -> ``replace_with``
* ``replaceWithChildren`` -> ``replace_with_children``
* ``findAll`` -> ``find_all``
* ``findAllNext`` -> ``find_all_next``
* ``findAllPrevious`` -> ``find_all_previous``
* ``findNext`` -> ``find_next``
* ``findNextSibling`` -> ``find_next_sibling``
* ``findNextSiblings`` -> ``find_next_siblings``
* ``findParent`` -> ``find_parent``
* ``findParents`` -> ``find_parents``
* ``findPrevious`` -> ``find_previous``
* ``findPreviousSibling`` -> ``find_previous_sibling``
* ``findPreviousSiblings`` -> ``find_previous_siblings``
* ``nextSibling`` -> ``next_sibling``
* ``previousSibling`` -> ``previous_sibling``
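For instance (a sketch of mine, not from the original changelog), the
same lookup in both dialects::

 # Beautiful Soup 3
 first_links = soup.findAll("a", limit=2)

 # Beautiful Soup 4
 first_links = soup.find_all("a", limit=2)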
Some arguments to the Beautiful Soup constructor were renamed for the
same reasons:

* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)``
* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)``

I renamed one method for compatibility with Python 3:

* ``Tag.has_key()`` -> ``Tag.has_attr()``

I renamed one attribute to use more accurate terminology:

* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element``

I renamed three attributes to avoid using words that have special
meaning to Python. Unlike the others, these changes are *not backwards
compatible.* If you used these attributes in BS3, your code will break
on BS4 until you change them.

* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup``
* ``Tag.next`` -> ``Tag.next_element``
* ``Tag.previous`` -> ``Tag.previous_element``

Generators
^^^^^^^^^^

I gave the generators PEP 8-compliant names, and transformed them into
properties:

* ``childGenerator()`` -> ``children``
* ``nextGenerator()`` -> ``next_elements``
* ``nextSiblingGenerator()`` -> ``next_siblings``
* ``previousGenerator()`` -> ``previous_elements``
* ``previousSiblingGenerator()`` -> ``previous_siblings``
* ``recursiveChildGenerator()`` -> ``descendants``
* ``parentGenerator()`` -> ``parents``

So instead of this::

 for parent in tag.parentGenerator():
     ...

You can write this::

 for parent in tag.parents:
     ...

(But the old code will still work.)

Some of the generators used to yield ``None`` after they were done, and
then stop. That was a bug. Now the generators just stop.

There are two new generators, :ref:`.strings and
.stripped_strings <string-generators>`. ``.strings`` yields
NavigableString objects, and ``.stripped_strings`` yields Python
strings that have had whitespace stripped.

XML
^^^

There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To
parse XML you pass in "xml" as the second argument to the
``BeautifulSoup`` constructor. For the same reason, the
``BeautifulSoup`` constructor no longer recognizes the ``isHTML``
argument.

Beautiful Soup's handling of empty-element XML tags has been
improved. Previously when you parsed XML you had to explicitly say
which tags were considered empty-element tags. The ``selfClosingTags``
argument to the constructor is no longer recognized. Instead,
Beautiful Soup considers any empty tag to be an empty-element tag. If
you add a child to an empty-element tag, it stops being an
empty-element tag.
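Here's a small sketch of that behavior (my own illustration, reusing
the ``<a><b /></a>`` document from earlier; it requires lxml, since
lxml is the only supported XML parser)::

 soup = BeautifulSoup("<a><b /></a>", "xml")
 soup.b
 # <b/>

 # Give the empty-element tag a child and it stops being one.
 soup.b.append("Now I have contents")
 soup.b
 # <b>Now I have contents</b>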
Entities
^^^^^^^^

An incoming HTML or XML entity is always converted into the
corresponding Unicode character. Beautiful Soup 3 had a number of
overlapping ways of dealing with entities, which have been
removed. The ``BeautifulSoup`` constructor no longer recognizes the
``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode,
Dammit`_ still has ``smart_quotes_to``, but its default is now to turn
smart quotes into Unicode.)

If you want to turn those Unicode characters back into HTML entities
on output, rather than turning them into UTF-8 characters, you need to
use ``.encode``, as described in `Output formatters`_. This may change
before the final release.

Miscellaneous
^^^^^^^^^^^^^

:ref:`Tag.string <.string>` now operates recursively. If tag A
contains a single tag B and nothing else, then A.string is the same as
B.string. (Previously, it was None.)

The ``BeautifulSoup`` constructor no longer recognizes the
``markupMassage`` argument. It's now the parser's responsibility to
handle markup correctly.

The rarely-used alternate parser classes like
``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been
removed. It's now the parser's decision how to handle ambiguous
markup.