From 6590573e18a533f30c9635ecbd6af163d6826ef8 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 7 Feb 2012 19:15:04 -0500 Subject: Documented today's changes. --- doc/source/index.rst | 158 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 141 insertions(+), 17 deletions(-) (limited to 'doc/source') diff --git a/doc/source/index.rst b/doc/source/index.rst index 1d9d54c..8e35f0e 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1464,13 +1464,25 @@ like calling ``.append()`` on a Python list:: soup.a.contents # [u'Foo', u'Bar'] -``BeautifulSoup.new_tag()`` ---------------------------- +``BeautifulSoup.new_tag()`` and ``new_string()`` +------------------------------------------------ + +If you need to add a string to a document, no problem--you can pass a +Python string in to ``append()``, or you can call the factory method +``BeautifulSoup.new_string()``:: -If you need to add a string to a document, no problem--you can just -pass a Python string in to ``append()``. But what if you need to -create a whole new tag? The best solution is to call the factory -method ``BeautifulSoup.new_tag()``:: + soup = BeautifulSoup("") + tag = soup.b + tag.append("Hello") + new_string = soup.new_string(" there") + tag.append(new_string) + tag + # Hello there. + tag.contents + # [u'Hello', u' there'] + +What if you need to create a whole new tag? The best solution is to +call the factory method ``BeautifulSoup.new_tag()``:: soup = BeautifulSoup("") original_tag = soup.b @@ -1504,6 +1516,28 @@ say. 
It works just like ``.insert()`` on a Python list:: tag.contents # [u'I linked to ', u'but did not endorse', example.com] +``insert_before()`` and ``insert_after()`` +------------------------------------------ + +The ``insert_before()`` method adds a tag or string to the parse tree +immediately before something else:: + + soup = BeautifulSoup("stop") + tag = soup.new_tag("i") + tag.string = "Don't" + tag.insert_before(soup.b.string) + soup.b + # Don'tstop + +The ``insert_after()`` method adds a tag or string to the parse tree +immediately `after` something else:: + + soup.new_string(" ever ").insert_after(soup.b.i) + soup.b + # Don't ever stop + soup.b.contents + # [Don't, u' ever ', u'stop'] + ``clear()`` ----------- @@ -1663,29 +1697,119 @@ within it:: The ``str()`` function returns a string encoded in UTF-8. See `Encodings`_ for other options. -Substituting HTML entities --------------------------- +You can also call ``encode()`` to get a bytestring, and ``decode()`` +to get Unicode. + +Output formatters +----------------- -If you give Beautiful Soup a document that contains HTML or XML -entities such as "&lquot;" they'll be converted to Unicode -characters:: +If you give Beautiful Soup a document that contains HTML entities like +"&ldquo;", they'll be converted to Unicode characters:: soup = BeautifulSoup("“Hello,” he said.") unicode(soup) # u'\u201cHello,\u201d he said.' If you then convert the document to a string, the Unicode characters -will be encoded as UTF-8:: +will be encoded as UTF-8. You won't get the HTML entities back:: str(soup) # '\xe2\x80\x9cHello,\xe2\x80\x9d he said.' -You can get the HTML entities back (or create them where they didn't -exist) by calling ``.encode()`` and passing in -``substitute_html_entities=True``:: +By default, the only characters that are escaped upon output are bare +ampersands and angle brackets. 
These get turned into "&amp;", "&lt;", +and "&gt;", so that Beautiful Soup doesn't inadvertently generate +invalid HTML or XML:: + + soup = BeautifulSoup("

The law firm of Dewey, Cheatem, & Howe

") + soup.p + #

The law firm of Dewey, Cheatem, & Howe

+ +You can change this behavior by providing a value for the +``formatter`` argument to ``prettify()``, ``encode()``, or +``decode()``. Beautiful Soup recognizes four possible values for +``formatter``. + +The default is ``formatter="minimal"``. Strings will only be processed +enough to ensure that Beautiful Soup generates valid HTML/XML:: + + french = "

Il a dit <<Sacré bleu!>>

" + soup = BeautifulSoup(french) + print(soup.prettify(formatter="minimal")) + # + # + #

+ # Il a dit <<Sacré bleu!>> + #

+ # + # + +``formatter="html"`` will convert Unicode characters to HTML entities +whenever possible:: + + print(soup.prettify(formatter="html")) + # + # + #

+ # Il a dit <<Sacré bleu!>> + #

+ # + # + +If you pass in ``formatter=None``, Beautiful Soup will not modify +strings at all on output. This is the fastest option, but it may lead +to Beautiful Soup generating invalid HTML/XML, as in this example:: + + print(soup.prettify(formatter=None)) + # + # + #

+ # Il a dit <> + #

+ # + # + + +Finally, if you pass in a function for ``formatter``, Beautiful Soup +will call that function once for every string in the document. You can +do whatever you want in this function. Here's a formatter that +converts strings to uppercase and does absolutely nothing else:: + + def uppercase(str): + return str.upper() + + print(soup.prettify(formatter=uppercase)) + # + # + #

+ # IL A DIT <> + #

+ # + # + +If you're writing your own function, you should know about the +``EntitySubstitution`` class in the ``bs4.dammit`` module. This class +implements Beautiful Soup's standard formatters as class methods: the +"html" formatter is ``EntitySubstitution.substitute_html``, and the +"minimal" formatter is ``EntitySubstitution.substitute_xml``. You can +use these functions to simulate ``formatter="html"`` or +``formatter="minimal"``, and then do something in addition. + +Here's an example that converts strings to uppercase, *and* replaces +Unicode characters with HTML entities whenever possible:: - soup.encode(substitute_html_entities=True) - # '“Hello,” he said.' + from bs4.dammit import EntitySubstitution + def uppercase_and_substitute_html_entities(str): + return EntitySubstitution.substitute_html(str.upper()) + + print(soup.prettify(formatter=uppercase_and_substitute_html_entities)) + # + # + #

+ # IL A DIT <<SACRÉ BLEU!>> + #

+ # + # ``get_text()`` -------------- -- cgit v1.2.3