From d91ac0c2a204fa79b4796cf079929fbbc6d214c8 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 1 Mar 2012 08:29:35 -0500 Subject: Initial port of code and tests. --- bs4/element.py | 108 +++++++++++++++++++++++ bs4/tests/test_tree.py | 228 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 336 insertions(+) diff --git a/bs4/element.py b/bs4/element.py index 650bacf..c660359 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -411,6 +411,114 @@ class PageElement(object): yield i i = i.parent + # Methods for supporting CSS selectors. + + tag_name_re = re.compile('^[a-z0-9]+$') + + # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute + # Tag + attribselect_re = re.compile( + r'^(?P\w+)?\[(?P\w+)(?P[=~\|\^\$\*]?)' + + r'=?"?(?P[^\]"]*)"?\]$' + ) + + def _attribute_checker(self, operator, attribute, value=''): + """Create a function that performs a CSS selector operation. + + Takes an operator, attribute and optional value. Returns a + function that will return True for elements that match that + combination. 
+ """ + if operator == '=': + # string representation of attribute is equal to value + return lambda el: str(el.get(attribute)) == value + elif operator == '~': + # string representation of attribute includes value as one + # of a set of space separated tokens + return lambda el: value in str(el.get(attribute, '')).split() + elif operator == '^': + # string representation of attribute starts with value + return lambda el: str(el.get(attribute, '')).startswith(value) + elif operator == '$': + # string represenation of attribute ends with value + return lambda el: str(el.get(attribute, '')).endswith(value) + elif operator == '*': + # string representation of attribute contains value + return lambda el: value in str(el.get(attribute, '')) + elif operator == '|': + # string representation of attribute is either exactly + # value or starts with value- + return lambda el: ( + str(el.get(attribute, '')) == value + or str(el.get(attribute, '')).startswith('%s-' % value)) + else: + return lambda el: el.has_key(attribute) + + def select(self, selector): + """Perform a CSS selection operation on the current element.""" + if selector == 'p[class~="class1"]': + import pdb; pdb.set_trace() + tokens = selector.split() + current_context = [self] + for token in tokens: + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag, attribute, operator, value = m.groups() + if not tag: + tag = True + checker = self._attribute_checker(operator, attribute, value) + found = [] + for context in current_context: + found.extend([el for el in context.find_all(tag) if checker(el)]) + current_context = found + continue + if '#' in token: + # ID selector + tag, id = token.split('#', 1) + if tag == "": + tag = True + el = current_context[0].find(tag, {'id': id}) + if el is None: + return [] # No match + current_context = [el] + continue + if '.' 
in token: + # Class selector + tag, klass = token.split('.', 1) + if not tag: + tag = True + found = [] + for context in current_context: + found.extend( + context.find_all( + tag, + {'class': lambda attr: attr and klass in attr.split()} + ) + ) + current_context = found + continue + if token == '*': + # Star selector + found = [] + for context in current_context: + found.extend(context.findAll(True)) + current_context = found + continue + # Here we should just have a regular tag + if not self.tag_name_re.match(token): + return [] + found = [] + for context in current_context: + found.extend(context.findAll(token)) + current_context = found + return current_context + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 4dda90e..39f78f7 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1300,3 +1300,231 @@ class TestNavigableStringSubclasses(SoupTest): soup.insert(1, doctype) self.assertEqual(soup.encode(), b"\n") + +class TestSoupSelector(SoupTest): + + HTML = """ + + + The title + + + + +

An H1

Some text

Some more text

An H2

Another

+ Bob +

Another H2

+ me +

English

English UK

English US

French

+ + + + + +""" + + def setUp(self): + self.soup = BeautifulSoup(self.HTML) + + def assertSelects(self, selector, expected_ids): + el_ids = [el['id'] for el in self.soup.select(selector)] + el_ids.sort() + expected_ids.sort() + self.assertEqual(expected_ids, el_ids, + "Selector %s, expected [%s], got [%s]" % ( + selector, ', '.join(expected_ids), ', '.join(el_ids) + ) + ) + + assertSelect = assertSelects + + def assertSelectMultiple(self, *tests): + for selector, expected_ids in tests: + self.assertSelect(selector, expected_ids) + + def test_one_tag_one(self): + els = self.soup.select('title') + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'title') + self.assertEqual(els[0].contents, [u'The title']) + + def test_one_tag_many(self): + els = self.soup.select('div') + self.assertEqual(len(els), 3) + for div in els: + self.assertEqual(div.name, 'div') + + def test_tag_in_tag_one(self): + els = self.soup.select('div div') + self.assertSelects('div div', ['inner']) + + def test_tag_in_tag_many(self): + for selector in ('html div', 'html body div', 'body div'): + self.assertSelects(selector, ['main', 'inner', 'footer']) + + def test_tag_no_match(self): + self.assertEqual(len(self.soup.select('del')), 0) + + def test_invalid_tag(self): + self.assertEqual(len(self.soup.select('tag%t')), 0) + + def test_header_tags(self): + self.assertSelectMultiple( + ('h1', ['header1']), + ('h2', ['header2', 'header3']), + ) + + def test_class_one(self): + for selector in ('.onep', 'p.onep', 'html p.onep'): + els = self.soup.select(selector) + self.assertEqual(len(els), 1) + self.assertEqual(els[0].name, 'p') + self.assertEqual(els[0]['class'], ['onep']) + + def test_class_mismatched_tag(self): + els = self.soup.select('div.onep') + self.assertEqual(len(els), 0) + + def test_one_id(self): + for selector in ('div#inner', '#inner', 'div div#inner'): + self.assertSelects(selector, ['inner']) + + def test_bad_id(self): + els = self.soup.select('#doesnotexist') + 
self.assertEqual(len(els), 0) + + def test_items_in_id(self): + els = self.soup.select('div#inner p') + self.assertEqual(len(els), 3) + for el in els: + self.assertEqual(el.name, 'p') + self.assertEqual(els[1]['class'], ['onep']) + self.assert_(not els[0].has_key('class')) + + def test_a_bunch_of_emptys(self): + for selector in ('div#main del', 'div#main div.oops', 'div div#main'): + self.assertEqual(len(self.soup.select(selector)), 0) + + def test_multi_class_support(self): + for selector in ('.class1', 'p.class1', '.class2', 'p.class2', + '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): + self.assertSelects(selector, ['pmulti']) + + def test_attribute_equals(self): + self.assertSelectMultiple( + ('p[class="onep"]', ['p1']), + ('p[id="p1"]', ['p1']), + ('[class="onep"]', ['p1']), + ('[id="p1"]', ['p1']), + ('link[rel="stylesheet"]', ['l1']), + ('link[type="text/css"]', ['l1']), + ('link[href="blah.css"]', ['l1']), + ('link[href="no-blah.css"]', []), + ('[rel="stylesheet"]', ['l1']), + ('[type="text/css"]', ['l1']), + ('[href="blah.css"]', ['l1']), + ('[href="no-blah.css"]', []), + ('p[href="no-blah.css"]', []), + ('[href="no-blah.css"]', []), + ) + + def test_attribute_tilde(self): + self.assertSelectMultiple( + ('p[class~="class1"]', ['pmulti']), + ('p[class~="class2"]', ['pmulti']), + ('p[class~="class3"]', ['pmulti']), + ('[class~="class1"]', ['pmulti']), + ('[class~="class2"]', ['pmulti']), + ('[class~="class3"]', ['pmulti']), + ('a[rel~="friend"]', ['bob']), + ('a[rel~="met"]', ['bob']), + ('[rel~="friend"]', ['bob']), + ('[rel~="met"]', ['bob']), + ) + + def test_attribute_startswith(self): + self.assertSelectMultiple( + ('[rel^="style"]', ['l1']), + ('link[rel^="style"]', ['l1']), + ('notlink[rel^="notstyle"]', []), + ('[rel^="notstyle"]', []), + ('link[rel^="notstyle"]', []), + ('link[href^="bla"]', ['l1']), + ('a[href^="http://"]', ['bob', 'me']), + ('[href^="http://"]', ['bob', 'me']), + ('[id^="p"]', ['pmulti', 'p1']), + ('[id^="m"]', 
['me', 'main']), + ('div[id^="m"]', ['main']), + ('a[id^="m"]', ['me']), + ) + + def test_attribute_endswith(self): + self.assertSelectMultiple( + ('[href$=".css"]', ['l1']), + ('link[href$=".css"]', ['l1']), + ('link[id$="1"]', ['l1']), + ('[id$="1"]', ['l1', 'p1', 'header1']), + ('div[id$="1"]', []), + ('[id$="noending"]', []), + ) + + def test_attribute_contains(self): + self.assertSelectMultiple( + # From test_attribute_startswith + ('[rel*="style"]', ['l1']), + ('link[rel*="style"]', ['l1']), + ('notlink[rel*="notstyle"]', []), + ('[rel*="notstyle"]', []), + ('link[rel*="notstyle"]', []), + ('link[href*="bla"]', ['l1']), + ('a[href*="http://"]', ['bob', 'me']), + ('[href*="http://"]', ['bob', 'me']), + ('[id*="p"]', ['pmulti', 'p1']), + ('div[id*="m"]', ['main']), + ('a[id*="m"]', ['me']), + # From test_attribute_endswith + ('[href*=".css"]', ['l1']), + ('link[href*=".css"]', ['l1']), + ('link[id*="1"]', ['l1']), + ('[id*="1"]', ['l1', 'p1', 'header1']), + ('div[id*="1"]', []), + ('[id*="noending"]', []), + # New for this test + ('[href*="."]', ['bob', 'me', 'l1']), + ('a[href*="."]', ['bob', 'me']), + ('link[href*="."]', ['l1']), + ('div[id*="n"]', ['main', 'inner']), + ('div[id*="nn"]', ['inner']), + ) + + def test_attribute_exact_or_hypen(self): + self.assertSelectMultiple( + ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), + ('p[lang|="fr"]', ['lang-fr']), + ('p[lang|="gb"]', []), + ) + + def test_attribute_exists(self): + self.assertSelectMultiple( + ('[rel]', ['l1', 'bob', 'me']), + ('link[rel]', ['l1']), + ('a[rel]', ['bob', 'me']), + ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), + ('p[class]', ['p1', 'pmulti']), + ('[blah]', []), + ('p[blah]', []), + ) -- cgit v1.2.3 From 8bbc84dfc6324a32066b87cf3a78ce0eb719e289 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 1 Mar 2012 08:49:27 -0500 Subject: Got tests to pass on Python 2 and Python 3. 
--- bs4/element.py | 40 +++++++++++++++++++++++++++------------- bs4/tests/test_tree.py | 2 +- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/bs4/element.py b/bs4/element.py index c660359..3baafe3 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -427,6 +427,17 @@ class PageElement(object): r'=?"?(?P[^\]"]*)"?\]$' ) + def _attr_value_as_string(self, value, default=None): + """Force an attribute value into a string representation. + + A multi-valued attribute will be converted into a + space-separated stirng. + """ + value = self.get(value, default) + if isinstance(value, list) or isinstance(value, tuple): + value =" ".join(value) + return value + def _attribute_checker(self, operator, attribute, value=''): """Create a function that performs a CSS selector operation. @@ -436,33 +447,36 @@ class PageElement(object): """ if operator == '=': # string representation of attribute is equal to value - return lambda el: str(el.get(attribute)) == value + return lambda el: el._attr_value_as_string(attribute) == value elif operator == '~': - # string representation of attribute includes value as one - # of a set of space separated tokens - return lambda el: value in str(el.get(attribute, '')).split() + def _includes_value(element): + attribute_value = element.get(attribute, []) + if not isinstance(attribute_value, list): + attribute_value = attribute_value.split() + return value in attribute_value + return _includes_value elif operator == '^': # string representation of attribute starts with value - return lambda el: str(el.get(attribute, '')).startswith(value) + return lambda el: el._attr_value_as_string(attribute, '').startswith(value) elif operator == '$': # string represenation of attribute ends with value - return lambda el: str(el.get(attribute, '')).endswith(value) + return lambda el: el._attr_value_as_string(attribute, '').endswith(value) elif operator == '*': # string representation of attribute contains value - return lambda el: value in 
str(el.get(attribute, '')) + return lambda el: value in el._attr_value_as_string(attribute, '') elif operator == '|': # string representation of attribute is either exactly # value or starts with value- - return lambda el: ( - str(el.get(attribute, '')) == value - or str(el.get(attribute, '')).startswith('%s-' % value)) + def _is_or_starts_with_dash(element): + attribute_value = element._attr_value_as_string(attribute, '') + return (attribute_value == value or attribute_value.startswith( + value + '-')) + return _is_or_starts_with_dash else: - return lambda el: el.has_key(attribute) + return lambda el: el.has_attr(attribute) def select(self, selector): """Perform a CSS selection operation on the current element.""" - if selector == 'p[class~="class1"]': - import pdb; pdb.set_trace() tokens = selector.split() current_context = [self] for token in tokens: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 39f78f7..bfc4218 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1412,7 +1412,7 @@ class TestSoupSelector(SoupTest): for el in els: self.assertEqual(el.name, 'p') self.assertEqual(els[1]['class'], ['onep']) - self.assert_(not els[0].has_key('class')) + self.assertFalse(els[0].has_key('class')) def test_a_bunch_of_emptys(self): for selector in ('div#main del', 'div#main div.oops', 'div div#main'): -- cgit v1.2.3 From 4a5136d31bf07a7b28b58343f0c32e41d895e110 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 1 Mar 2012 09:06:25 -0500 Subject: Test that CSS selectors work within the tree as well as at the top level. --- AUTHORS.txt | 10 +++++++--- NEWS.txt | 4 ++++ bs4/element.py | 20 ++++++++++++-------- bs4/tests/test_tree.py | 11 ++++++++++- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/AUTHORS.txt b/AUTHORS.txt index e093cd6..2ac8fcc 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -11,12 +11,16 @@ of UnicodeDammit. 
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful Soup 4 working under Python 3. +Simon Willison wrote soupselect, which was used to make Beautiful Soup +support CSS selectors. + Sam Ruby helped with a lot of edge cases. Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his work in solving the nestable tags conundrum. -The following people have contributed patches to Beautiful Soup: +An incomplete list of people who have contributed patches to Beautiful +Soup: Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang, Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris @@ -26,8 +30,8 @@ The following people have contributed patches to Beautiful Soup: Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn Webster, Paul Wright, Danny Yoo -The following people made suggestions or found bugs or found ways to -break Beautiful Soup: +An incomplete list of people who made suggestions or found bugs or +found ways to break Beautiful Soup: Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, diff --git a/NEWS.txt b/NEWS.txt index 3079aa1..fff19ad 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,3 +1,7 @@ += 4.0.0b10 () = + +* Added support for CSS selectors, taken from the soupselect project. + = 4.0.0b9 (20110228) = * Fixed the string representation of DOCTYPEs that have both a public diff --git a/bs4/element.py b/bs4/element.py index 3baafe3..584e171 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -446,9 +446,11 @@ class PageElement(object): combination. 
""" if operator == '=': - # string representation of attribute is equal to value + # string representation of `attribute` is equal to `value` return lambda el: el._attr_value_as_string(attribute) == value elif operator == '~': + # space-separated list representation of `attribute` + # contains `value` def _includes_value(element): attribute_value = element.get(attribute, []) if not isinstance(attribute_value, list): @@ -456,17 +458,19 @@ class PageElement(object): return value in attribute_value return _includes_value elif operator == '^': - # string representation of attribute starts with value - return lambda el: el._attr_value_as_string(attribute, '').startswith(value) + # string representation of `attribute` starts with `value` + return lambda el: el._attr_value_as_string( + attribute, '').startswith(value) elif operator == '$': - # string represenation of attribute ends with value - return lambda el: el._attr_value_as_string(attribute, '').endswith(value) + # string represenation of `attribute` ends with `value` + return lambda el: el._attr_value_as_string( + attribute, '').endswith(value) elif operator == '*': - # string representation of attribute contains value + # string representation of `attribute` contains `value` return lambda el: value in el._attr_value_as_string(attribute, '') elif operator == '|': - # string representation of attribute is either exactly - # value or starts with value- + # string representation of `attribute` is either exactly + # `value` or starts with `value` and then a dash. 
def _is_or_starts_with_dash(element): attribute_value = element._attr_value_as_string(attribute, '') return (attribute_value == value or attribute_value.startswith( diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index bfc4218..3f32736 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1301,7 +1301,7 @@ class TestNavigableStringSubclasses(SoupTest): self.assertEqual(soup.encode(), b"\n") -class TestSoupSelector(SoupTest): +class TestSoupSelector(TreeTest): HTML = """ @@ -1528,3 +1528,12 @@ class TestSoupSelector(SoupTest): ('[blah]', []), ('p[blah]', []), ) + + def test_select_on_element(self): + # Other tests operate on the tree; this operates on an element + # within the tree. + inner = self.soup.find("div", id="main") + selected = inner.select("div") + # The

<div id="inner"> tag was selected. The <div id="footer">

+ # tag was not. + self.assertSelectsIDs(selected, ['inner']) -- cgit v1.2.3 From b577f965d28031628406091cc36466353795ced3 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 1 Mar 2012 10:37:42 -0500 Subject: Updated docs. --- NEWS.txt | 2 +- doc | 1 - doc/source/index.rst | 2633 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 2634 insertions(+), 2 deletions(-) delete mode 120000 doc create mode 100644 doc/source/index.rst diff --git a/NEWS.txt b/NEWS.txt index fff19ad..944c677 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,6 +1,6 @@ = 4.0.0b10 () = -* Added support for CSS selectors, taken from the soupselect project. +* Added support for simple CSS selectors, taken from the soupselect project. = 4.0.0b9 (20110228) = diff --git a/doc b/doc deleted file mode 120000 index 43a5bc0..0000000 --- a/doc +++ /dev/null @@ -1 +0,0 @@ -bs4/doc/ \ No newline at end of file diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000..d13bd17 --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,2633 @@ +Beautiful Soup Documentation +============================ + +.. image:: 6.1.jpg + :align: right + :alt: "The Fish-Footman began by producing from under his arm a great letter, nearly as large as himself." + +`Beautiful Soup `_ is a +Python library for pulling data out of HTML and XML files. It works +with your favorite parser to provide idiomatic ways of navigating, +searching, and modifying the parse tree. It commonly saves programmers +hours or days of work. + +These instructions illustrate all major features of Beautiful Soup 4, +with examples. I show you what the library is good for, how it works, +how to use it, how to make it do what you want, and what to do when it +violates your expectations. + +The examples in this documentation should work the same way in Python +2.7 and Python 3.2. + +You might be looking for the documentation for `Beautiful Soup 3 +`_. 
If +you want to learn about the differences between Beautiful Soup 3 and +Beautiful Soup 4, see `Porting code to BS4`_. + +Getting help +------------ + +If you have questions about Beautiful Soup, or run into problems, +`send mail to the discussion group +`_. + +Quick Start +=========== + +Here's an HTML document I'll be using as an example throughout this +document. It's part of a story from `Alice in Wonderland`:: + + html_doc = """ + The Dormouse's story + +

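<p class="title"><b>The Dormouse's story</b></p> + + <p class="story">Once upon a time there were three little sisters; and their names were + <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, + <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and + <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; + and they lived at the bottom of a well.</p> + + <p class="story">...</p>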

+ """ + +Running the "three sisters" document through Beautiful Soup gives us a +``BeautifulSoup`` object, which represents the document as a nested +data structure:: + + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_doc) + + print(soup.prettify()) + # + # + # + # The Dormouse's story + # + # + # + #

+ # + # The Dormouse's story + # + #

+ #

+ # Once upon a time there were three little sisters; and their names were + # + # Elsie + # + # , + # + # Lacie + # + # and + # + # Tillie + # + # ; and they lived at the bottom of a well. + #

+ #

+ # ... + #

+ # + # + +Here are some simple ways to navigate that data structure:: + + soup.title + # The Dormouse's story + + soup.title.name + # u'title' + + soup.title.string + # u'The Dormouse's story' + + soup.title.parent.name + # u'head' + + soup.p + #

<p class="title"><b>The Dormouse's story</b></p>

+ + soup.p['class'] + # u'title' + + soup.a + # Elsie + + soup.find_all('a') + # [Elsie, + # Lacie, + # Tillie] + + soup.find(id="link3") + # Tillie + +One common task is extracting all the URLs found within a page's tags:: + + for link in soup.find_all('a'): + print(link.get('href')) + # http://example.com/elsie + # http://example.com/lacie + # http://example.com/tillie + +Another common task is extracting all the text from a page:: + + print(soup.get_text()) + # The Dormouse's story + # + # The Dormouse's story + # + # Once upon a time there were three little sisters; and their names were + # Elsie, + # Lacie and + # Tillie; + # and they lived at the bottom of a well. + # + # ... + +Does this look like what you need? If so, read on. + +Installing Beautiful Soup +========================= + +Beautiful Soup 4 is published through PyPi, so you can install it with +``easy_install`` or ``pip``. The package name is ``beautifulsoup4``, +and the same package works on Python 2 and Python 3. + +:kbd:`$ easy_install beautifulsoup4` + +:kbd:`$ pip install beautifulsoup4` + +(The ``BeautifulSoup`` package is probably `not` what you want. That's +the previous major release, `Beautiful Soup 3`_. Lots of software uses +BS3, so it's still available, but if you're writing new code you +should install ``beautifulsoup4``.) + +You can also `download the Beautiful Soup 4 source tarball +`_ and +install it with ``setup.py``. The license for Beautiful Soup allows +you to package the entire library with your application, allowing you +to copy the ``bs4`` directory into your application's codebase. + +I use Python 2.7 and Python 3.2 to develop Beautiful Soup, but it +should work with other recent versions. + +.. _parser-installation: + +Be sure to install a good parser! +--------------------------------- + +Beautiful Soup uses a plugin system that supports a number of popular +Python parsers. 
If no third-party parsers are installed, Beautiful +Soup uses the HTML parser that comes with Python. In recent releases +of Python (2.7.3 and 3.2.2), this parser is excellent at handling bad +HTML. Unfortunately, in older releases, it's not very good at all. + +Even if you're using a recent release of Python, I recommend you +install the `lxml parser `_ if you can. Its +reliability is good on both HTML and XML, and it's much faster than +Python's built-in parser. Beautiful Soup will detect that you have +lxml installed, and use it instead of Python's built-in parser. + +Depending on your setup, you might install lxml with one of these commands: + +:kbd:`$ apt-get install python-lxml` + +:kbd:`$ easy_install lxml` + +:kbd:`$ pip install lxml` + +If you're using Python 2, another alternative is the pure-Python +`html5lib parser `_, which parses +HTML the way a web browser does. Depending on your setup, you might +install html5lib with one of these commands: + +:kbd:`$ apt-get install python-html5lib` + +:kbd:`$ easy_install html5lib` + +:kbd:`$ pip install html5lib` + +Making the soup +=============== + +To parse a document, pass it into the ``BeautifulSoup`` +constructor. You can pass in a string or an open filehandle:: + + from bs4 import BeautifulSoup + + soup = BeautifulSoup(open("index.html")) + + soup = BeautifulSoup("data") + +First, the document is converted to Unicode, and HTML entities are +converted to Unicode characters:: + + BeautifulSoup("Sacré bleu!") + Sacré bleu! + +Beautiful Soup then parses the document using the best available +parser. It will use an HTML parser unless you specifically tell it to +use an XML parser. (See `Choosing a parser`_.) + +Kinds of objects +================ + +Beautiful Soup transforms a complex HTML document into a complex tree +of Python objects. But you'll only ever have to deal with about four +`kinds` of objects. + +.. 
_Tag: + +``Tag`` +------- + +A ``Tag`` object corresponds to an XML or HTML tag in the original document:: + + soup = BeautifulSoup('Extremely bold') + tag = soup.b + type(tag) + # + +Tags have a lot of attributes and methods, and I'll cover most of them +in `Navigating the tree`_ and `Searching the tree`_. For now, the most +important features of a tag are its name and attributes. + +Name +^^^^ + +Every tag has a name, accessible as ``.name``:: + + tag.name + # u'b' + +If you change a tag's name, the change will be reflected in any HTML +markup generated by Beautiful Soup:: + + tag.name = "blockquote" + tag + #

<blockquote class="boldest">Extremely bold</blockquote> + +Attributes +^^^^^^^^^^ + +A tag may have any number of attributes. The tag ``<b class="boldest">`` has an attribute "class" whose value is +"boldest". You can access a tag's attributes by treating the tag like +a dictionary:: + + tag['class'] + # u'boldest' + +You can access that dictionary directly as ``.attrs``:: + + tag.attrs + # {u'class': u'boldest'} + +You can add, remove, and modify a tag's attributes. Again, this is +done by treating the tag as a dictionary:: + + tag['class'] = 'verybold' + tag['id'] = 1 + tag + # <blockquote class="verybold" id="1">Extremely bold</blockquote> + + del tag['class'] + del tag['id'] + tag + # <blockquote>Extremely bold</blockquote>
+ +.. _multivalue: + +Multi-valued attributes +&&&&&&&&&&&&&&&&&&&&&&& + +HTML 4 defines a few attributes that can have multiple values. HTML 5 +removes a couple of them, but defines a few more. The most common +multi-valued attribute is ``class`` (that is, a tag can have more than +one CSS class). Others include ``rel``, ``rev``, ``accept-charset``, +``headers``, and ``accesskey``. Beautiful Soup presents the value(s) +of a multi-valued attribute as a list:: + + css_soup = BeautifulSoup('
<p class="body strikeout"></p>') + css_soup.p['class'] + # ["body", "strikeout"] + + css_soup = BeautifulSoup('<p class="body"></p>') + css_soup.p['class'] + # ["body"] + +If an attribute `looks` like it has more than one value, but it's not +a multi-valued attribute as defined by any version of the HTML +standard, Beautiful Soup will leave the attribute alone:: + + id_soup = BeautifulSoup('<p id="my id"></p>') + id_soup.p['id'] + # 'my id' + +When you turn a tag back into a string, multiple attribute values are +consolidated:: + + rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>') + rel_soup.a['rel'] + # ['index'] + rel_soup.a['rel'] = ['index', 'contents'] + print(rel_soup.p) + # <p>Back to the <a rel="index contents">homepage</a></p> + +If you parse a document as XML, there are no multi-valued attributes:: + + xml_soup = BeautifulSoup('<p class="body strikeout"></p>
', 'xml') + xml_soup.p['class'] + # u'body strikeout' + + + +``NavigableString`` +------------------- + +A string corresponds to a bit of text within a tag. Beautiful Soup +defines the ``NavigableString`` class to contain these bits of text:: + + tag.string + # u'Extremely bold' + type(tag.string) + # + +A ``NavigableString`` is just like a Python Unicode string, except +that it also supports some of the features described in `Navigating +the tree`_ and `Searching the tree`_. You can convert a +``NavigableString`` to a Unicode string with ``unicode()``:: + + unicode_string = unicode(tag.string) + unicode_string + # u'Extremely bold' + type(unicode_string) + # + +You can't edit a string in place, but you can replace one string with +another, using :ref:`replace_with`:: + + tag.string.replace_with("No longer bold") + tag + #
<blockquote>No longer bold</blockquote>
+ +``NavigableString`` supports most of the features described in +`Navigating the tree`_ and `Searching the tree`_, but not all of +them. In particular, since a string can't contain anything (the way a +tag may contain a string or another tag), strings don't support the +``.contents`` or ``.string`` attributes, or the `find()` method. + +``BeautifulSoup`` +----------------- + +The ``BeautifulSoup`` object itself represents the document as a +whole. For most purposes, you can treat it as a :ref:`Tag` +object. This means it supports most of the methods described in +`Navigating the tree`_ and `Searching the tree`_. + +Since the ``BeautifulSoup`` object doesn't correspond to an actual +HTML or XML tag, it has no name and no attributes. But sometimes it's +useful to look at its ``.name``, so it's been given the special +``.name`` "[document]":: + + soup.name + # u'[document]' + +Comments and other special strings +---------------------------------- + +``Tag``, ``NavigableString``, and ``BeautifulSoup`` cover almost +everything you'll see in an HTML or XML file, but there are a few +leftover bits. The only one you'll probably ever need to worry about +is the comment:: + + markup = "" + soup = BeautifulSoup(markup) + comment = soup.b.string + type(comment) + # + +The ``Comment`` object is just a special type of ``NavigableString``:: + + comment + # u'Hey, buddy. Want to buy a used parser' + +But when it appears as part of an HTML document, a ``Comment`` is +displayed with special formatting:: + + print(soup.b.prettify()) + # + # + # + +Beautiful Soup defines classes for anything else that might show up in +an XML document: ``CData``, ``ProcessingInstruction``, +``Declaration``, and ``Doctype``. Just like ``Comment``, these classes +are subclasses of ``NavigableString`` that add something extra to the +string. 
Here's an example that replaces the comment with a CDATA +block:: + + from bs4 import CData + cdata = CData("A CDATA block") + comment.replace_with(cdata) + + print(soup.b.prettify()) + # + # + # + + +Navigating the tree +=================== + +Here's the "Three sisters" HTML document again:: + + html_doc = """ + The Dormouse's story + +
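<p class="title"><b>The Dormouse's story</b></p> + + <p class="story">Once upon a time there were three little sisters; and their names were + <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, + <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and + <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; + and they lived at the bottom of a well.</p> + + <p class="story">...</p>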
+ """ + + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_doc) + +I'll use this as an example to show you how to move from one part of +a document to another. + +Going down +---------- + +Tags may contain strings and other tags. These elements are the tag's +`children`. Beautiful Soup provides a lot of different attributes for +navigating and iterating over a tag's children. + +Note that Beautiful Soup strings don't support any of these +attributes, because a string can't have children. + +Navigating using tag names +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The simplest way to navigate the parse tree is to say the name of the +tag you want. If you want the tag, just say ``soup.head``:: + + soup.head + # The Dormouse's story + + soup.title + # The Dormouse's story + +You can do use this trick again and again to zoom in on a certain part +of the parse tree. This code gets the first tag beneath the tag:: + + soup.body.b + # The Dormouse's story + +Using a tag name as an attribute will give you only the `first` tag by that +name:: + + soup.a + # Elsie + +If you need to get `all` the tags, or anything more complicated +than the first tag with a certain name, you'll need to use one of the +methods described in `Searching the tree`_, such as `find_all()`:: + + soup.find_all('a') + # [Elsie, + # Lacie, + # Tillie] + +``.contents`` and ``.children`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A tag's children are available in a list called ``.contents``:: + + head_tag = soup.head + head_tag + # The Dormouse's story + + head_tag.contents + [The Dormouse's story] + + title_tag = head_tag.contents[0] + title_tag + # The Dormouse's story + title_tag.contents + # [u'The Dormouse's story'] + +The ``BeautifulSoup`` object itself has children. 
In this case, the + tag is the child of the ``BeautifulSoup`` object.:: + + len(soup.contents) + # 1 + soup.contents[0].name + # u'html' + +A string does not have ``.contents``, because it can't contain +anything:: + + text = title_tag.contents[0] + text.contents + # AttributeError: 'NavigableString' object has no attribute 'contents' + +Instead of getting them as a list, you can iterate over a tag's +children using the ``.children`` generator:: + + for child in title_tag.children: + print(child) + # The Dormouse's story + +``.descendants`` +^^^^^^^^^^^^^^^^ + +The ``.contents`` and ``.children`` attributes only consider a tag's +`direct` children. For instance, the tag has a single direct +child--the tag:: + + head_tag.contents + # [<title>The Dormouse's story] + +But the tag itself has a child: the string "The Dormouse's +story". There's a sense in which that string is also a child of the +<head> tag. The ``.descendants`` attribute lets you iterate over `all` +of a tag's children, recursively: its direct children, the children of +its direct children, and so on:: + + for child in head_tag.descendants: + print(child) + # <title>The Dormouse's story + # The Dormouse's story + +The tag has only one child, but it has two descendants: the + tag and the <title> tag's child. The ``BeautifulSoup`` object +only has one direct child (the <html> tag), but it has a whole lot of +descendants:: + + len(list(soup.children)) + # 1 + len(list(soup.descendants)) + # 25 + +.. 
_.string: + +``.string`` +^^^^^^^^^^^ + +If a tag has only one child, and that child is a string, the string is +made available as ``.string``:: + + title_tag.string + # u'The Dormouse's story' + +If a tag's only child is another tag, and `that` tag has a +``.string``, then the parent tag is considered to have the same +``.string`` as its child:: + + head_tag.contents + # [<title>The Dormouse's story] + + head_tag.string + # u'The Dormouse's story' + +If a tag contains more than one thing, then it's not clear what +``.string`` should refer to, so ``.string`` is defined to be +``None``:: + + print(soup.html.string) + # None + +.. _string-generators: + +``.strings`` and ``stripped_strings`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If there's more than one thing inside a tag, you can still look at +just the strings. Use the ``.strings`` generator:: + + for string in soup.strings: + print(repr(string)) + # u"The Dormouse's story" + # u'\n\n' + # u"The Dormouse's story" + # u'\n\n' + # u'Once upon a time there were three little sisters; and their names were\n' + # u'Elsie' + # u',\n' + # u'Lacie' + # u' and\n' + # u'Tillie' + # u';\nand they lived at the bottom of a well.' + # u'\n\n' + # u'...' + # u'\n' + +These strings tend to have a lot of extra whitespace, which you can +remove by using the ``.stripped_strings`` generator instead:: + + for string in soup.stripped_strings: + print(repr(string)) + # u"The Dormouse's story" + # u"The Dormouse's story" + # u'Once upon a time there were three little sisters; and their names were' + # u'Elsie' + # u',' + # u'Lacie' + # u'and' + # u'Tillie' + # u';\nand they lived at the bottom of a well.' + # u'...' + +Here, strings consisting entirely of whitespace are ignored, and +whitespace at the beginning and end of strings is removed. + +Going up +-------- + +Continuing the "family tree" analogy, every tag and every string has a +`parent`: the tag that contains it. + +.. 
_.parent: + +``.parent`` +^^^^^^^^^^^ + +You can access an element's parent with the ``.parent`` attribute. In +the example "three sisters" document, the tag is the parent +of the tag:: + + title_tag = soup.title + title_tag + # <title>The Dormouse's story + title_tag.parent + # The Dormouse's story + +The title string itself has a parent: the tag that contains +it:: + + title_tag.string.parent + # <title>The Dormouse's story + +The parent of a top-level tag like is the ``BeautifulSoup`` object +itself:: + + html_tag = soup.html + type(html_tag.parent) + # + +And the ``.parent`` of a ``BeautifulSoup`` object is defined as None:: + + print(soup.parent) + # None + +.. _.parents: + +``.parents`` +^^^^^^^^^^^^ + +You can iterate over all of an element's parents with +``.parents``. This example uses ``.parents`` to travel from an tag +buried deep within the document, to the very top of the document:: + + link = soup.a + link + # Elsie + for parent in link.parents: + if parent is None: + print(parent) + else: + print(parent.name) + # p + # body + # html + # [document] + # None + +Going sideways +-------------- + +Consider a simple document like this:: + + sibling_soup = BeautifulSoup("text1text2") + print(sibling_soup.prettify()) + # + # + # + # + # text1 + # + # + # text2 + # + # + # + # + +The tag and the tag are at the same level: they're both direct +children of the same tag. We call them `siblings`. When a document is +pretty-printed, siblings show up at the same indentation level. You +can also use this relationship in the code you write. 
+ +``.next_sibling`` and ``.previous_sibling`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can use ``.next_sibling`` and ``.previous_sibling`` to navigate +between page elements that are on the same level of the parse tree:: + + sibling_soup.b.next_sibling + # text2 + + sibling_soup.c.previous_sibling + # text1 + +The tag has a ``.next_sibling``, but no ``.previous_sibling``, +because there's nothing before the tag `on the same level of the +tree`. For the same reason, the tag has a ``.previous_sibling`` +but no ``.next_sibling``:: + + print(sibling_soup.b.previous_sibling) + # None + print(sibling_soup.c.next_sibling) + # None + +The strings "text1" and "text2" are `not` siblings, because they don't +have the same parent:: + + sibling_soup.b.string + # u'text1' + + print(sibling_soup.b.string.next_sibling) + # None + +In real documents, the ``.next_sibling`` or ``.previous_sibling`` of a +tag will usually be a string containing whitespace. Going back to the +"three sisters" document:: + + Elsie + Lacie + Tillie + +You might think that the ``.next_sibling`` of the first tag would +be the second tag. But actually, it's a string: the comma and +newline that separate the first tag from the second:: + + link = soup.a + link + # Elsie + + link.next_sibling + # u',\n' + +The second tag is actually the ``.next_sibling`` of the comma:: + + link.next_sibling.next_sibling + # Lacie + +.. _sibling-generators: + +``.next_siblings`` and ``.previous_siblings`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can iterate over a tag's siblings with ``.next_siblings`` or +``.previous_siblings``:: + + for sibling in soup.a.next_siblings: + print(repr(sibling)) + # u',\n' + # Lacie + # u' and\n' + # Tillie + # u'; and they lived at the bottom of a well.' 
+ # None + + for sibling in soup.find(id="link3").previous_siblings: + print(repr(sibling)) + # ' and\n' + # Lacie + # u',\n' + # Elsie + # u'Once upon a time there were three little sisters; and their names were\n' + # None + +Going back and forth +-------------------- + +Take a look at the beginning of the "three sisters" document:: + + The Dormouse's story +
The Dormouse's story
+ +An HTML parser takes this string of characters and turns it into a +series of events: "open an tag", "open a tag", "open a + tag", "add a string", "close the <title> tag", "open a <p> +tag", and so on. Beautiful Soup offers tools for reconstructing the +initial parse of the document. + +.. _element-generators: + +``.next_element`` and ``.previous_element`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``.next_element`` attribute of a string or tag points to whatever +was parsed immediately afterwards. It might be the same as +``.next_sibling``, but it's usually drastically different. + +Here's the final <a> tag in the "three sisters" document. Its +``.next_sibling`` is a string: the conclusion of the sentence that was +interrupted by the start of the <a> tag.:: + + last_a_tag = soup.find("a", id="link3") + last_a_tag + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> + + last_a_tag.next_sibling + # '; and they lived at the bottom of a well.' + +But the ``.next_element`` of that <a> tag, the thing that was parsed +immediately after the <a> tag, is `not` the rest of that sentence: +it's the word "Tillie":: + + last_a_tag.next_element + # u'Tillie' + +That's because in the original markup, the word "Tillie" appeared +before that semicolon. The parser encountered an <a> tag, then the +word "Tillie", then the closing </a> tag, then the semicolon and rest of +the sentence. The semicolon is on the same level as the <a> tag, but the +word "Tillie" was encountered first. + +The ``.previous_element`` attribute is the exact opposite of +``.next_element``. It points to whatever element was parsed +immediately before this one:: + + last_a_tag.previous_element + # u' and\n' + last_a_tag.previous_element.next_element + # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> + +``.next_elements`` and ``.previous_elements`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You should get the idea by now. 
You can use these iterators to move +forward or backward in the document as it was parsed:: + + for element in last_a_tag.next_elements: + print(repr(element)) + # u'Tillie' + # u';\nand they lived at the bottom of a well.' + # u'\n\n' + # <p class="story">...</p> + # u'...' + # u'\n' + # None + +Searching the tree +================== + +Beautiful Soup defines a lot of methods for searching the parse tree, +but they're all very similar. I'm going to spend a lot of time explaining +the two most popular methods: ``find()`` and ``find_all()``. The other +methods take almost exactly the same arguments, so I'll just cover +them briefly. + +Once again, I'll be using the "three sisters" document as an example:: + + html_doc = """ + <html><head><title>The Dormouse's story + +
The Dormouse's story
+ +
Once upon a time there were three little sisters; and their names were + Elsie, + Lacie and + Tillie; + and they lived at the bottom of a well.
+ +
...
+ """ + + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_doc) + +By passing in a filter to a method like ``find_all()``, you can +isolate whatever parts of the document you're interested in. + +Kinds of filters +---------------- + +Before talking in detail about ``find_all()`` and similar methods, I +want to show examples of different filters you can pass into these +methods. These filters show up again and again, throughout the +search API. You can use them to filter based on a tag's name, +on its attributes, on the text of a string, or on some combination of +these. + +.. _a string: + +A string +^^^^^^^^ + +The simplest filter is a string. Pass a string to a search method and +Beautiful Soup will perform a match against that exact string. This +code finds all the tags in the document:: + + soup.find_all('b') + # [The Dormouse's story] + +.. _a regular expression: + +A regular expression +^^^^^^^^^^^^^^^^^^^^ + +If you pass in a regular expression object, Beautiful Soup will filter +against that regular expression. This code finds all the tags whose +names start with the letter "b"; in this case, the tag and the + tag:: + + import re + for tag in soup.find_all(re.compile("b.*")): + print(tag.name) + # body + # b + +.. _a list: + +A list +^^^^^^ + +If you pass in a list, Beautiful Soup will allow a string match +against `any` item in that list. This code finds all the tags +`and` all the tags:: + + soup.find_all(["a", "b"]) + # [The Dormouse's story, + # Elsie, + # Lacie, + # Tillie] + +.. _the value True: + +``True`` +^^^^^^^^ + +The value ``True`` matches everything it can. This code finds `all` +the tags in the document, but none of the text strings:: + + for tag in soup.find_all(True): + print(tag.name) + # html + # head + # title + # body + # p + # b + # p + # a + # a + # a + # p + +.. _a function: + +A function +^^^^^^^^^^ + +If none of the other matches work for you, define a function that +takes an element as its only argument. 
The function should return +``True`` if the argument matches, and ``False`` otherwise. + +Here's a function that returns ``True`` if a tag defines the "class" +attribute but doesn't define the "id" attribute:: + + def has_class_but_no_id(tag): + return tag.has_key('class') and not tag.has_key('id') + +Pass this function into ``find_all()`` and you'll pick up all the
+tags:: + + soup.find_all(has_class_but_no_id) + # [
The Dormouse's story
, + #
Once upon a time there were...
, + #
...
] + +This function only picks up the
tags. It doesn't pick up the +tags, because those tags define both "class" and "id". It doesn't pick +up tags like and , because those tags don't define +"class". + +Here's a function that returns ``True`` if a tag is surrounded by +string objects:: + + from bs4 import NavigableString + def surrounded_by_strings(tag): + return (isinstance(tag.next_element, NavigableString) + and isinstance(tag.previous_element, NavigableString)) + + for tag in soup.find_all(surrounded_by_strings): + print(tag.name) + # p + # a + # a + # a + # p + +Now we're ready to look at the search methods in detail. + +``find_all()`` +-------------- + +Signature: find_all(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive +<recursive>`, :ref:`text <text>`, :ref:`limit <limit>`, :ref:`**kwargs <kwargs>`) + +The ``find_all()`` method looks through a tag's descendants and +retrieves `all` descendants that match your filters. I gave several +examples in `Kinds of filters`_, but here are a few more:: + + soup.find_all("title") + # [<title>The Dormouse's story] + + soup.find_all("p", "title") + # [
The Dormouse's story
] + + soup.find_all("a") + # [Elsie, + # Lacie, + # Tillie] + + soup.find_all(id="link2") + # [Lacie] + + import re + soup.find(text=re.compile("sisters")) + # u'Once upon a time there were three little sisters; and their names were\n' + +Some of these should look familiar, but others are new. What does it +mean to pass in a value for ``text``, or ``id``? Why does +``find_all("p", "title")`` find a
tag with the CSS class "title"? +Let's look at the arguments to ``find_all()``. + +.. _name: + +The ``name`` argument +^^^^^^^^^^^^^^^^^^^^^ + +Pass in a value for ``name`` and you'll tell Beautiful Soup to only +consider tags with certain names. Text strings will be ignored, as +will tags whose names don't match. + +This is the simplest usage:: + + soup.find_all("title") + # [The Dormouse's story] + +Recall from `Kinds of filters`_ that the value to ``name`` can be `a +string`_, `a regular expression`_, `a list`_, `a function`_, or `the value +True`_. + +.. _kwargs: + +The keyword arguments +^^^^^^^^^^^^^^^^^^^^^ + +Any argument that's not recognized will be turned into a filter on tag +attributes. If you pass in a value for an argument called ``id``, +Beautiful Soup will filter against the tag's 'id' attribute:: + + soup.find_all(id='link2') + # [Lacie] + +If you pass in a value for ``href``, Beautiful Soup will filter +against the tag's 'href' attribute:: + + soup.find_all(href=re.compile("elsie")) + # [Elsie] + +You can filter an attribute based on `a string`_, `a regular +expression`_, `a list`_, `a function`_, or `the value True`_. + +This code finds all tags that have an ``id`` attribute, regardless of +what the value is:: + + soup.find_all(id=True) + # [Elsie, + # Lacie, + # Tillie] + +You can filter multiple attributes at once by passing in more than one +keyword argument:: + + soup.find_all(href=re.compile("elsie"), id='link1') + # [Elsie] + +.. _attrs: + +Searching by CSS class +^^^^^^^^^^^^^^^^^^^^^^ + +Instead of using keyword arguments, you can filter tags based on their +attributes by passing a dictionary in for ``attrs``. These two lines of +code are equivalent:: + + soup.find_all(href=re.compile("elsie"), id='link1') + soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'}) + +The ``attrs`` argument would be a pretty obscure feature were it not for +one thing: CSS. 
It's very useful to search for a tag that has a +certain CSS class, but the name of the CSS attribute, "class", is also a +Python reserved word. + +You can use ``attrs`` to search by CSS class:: + + soup.find_all("a", { "class" : "sister" }) + # [Elsie, + # Lacie, + # Tillie] + +But that's a lot of code for such a common operation. Instead, you can +pass a string for ``attrs`` rather than a dictionary. The string will be used +to restrict the CSS class:: + + soup.find_all("a", "sister") + # [Elsie, + # Lacie, + # Tillie] + +You can also pass in a regular expression, a function or +True. Anything you pass in for ``attrs`` that's not a dictionary will +be used to search against the CSS class:: + + soup.find_all(attrs=re.compile("itl")) + # [
The Dormouse's story
] + + def has_six_characters(css_class): + return css_class is not None and len(css_class) == 6 + + soup.find_all(attrs=has_six_characters) + # [Elsie, + # Lacie, + # Tillie] + +:ref:`Remember ` that a single tag can have multiple +values for its "class" attribute. When you search for a tag that +matches a certain CSS class, you're matching against `any` of its CSS +classes:: + + css_soup = BeautifulSoup('
') + css_soup.find_all("p", "strikeout") + # [
] + + css_soup.find_all("p", "body") + # [
] + +Searching for the string value of the ``class`` attribute won't work:: + + css_soup.find_all("p", "body strikeout") + # [] + +.. _text: + +The ``text`` argument +^^^^^^^^^^^^^^^^^^^^^ + +With ``text`` you can search for strings instead of tags. As with +``name`` and the keyword arguments, you can pass in `a string`_, `a +regular expression`_, `a list`_, `a function`_, or `the value True`_. +Here are some examples:: + + soup.find_all(text="Elsie") + # [u'Elsie'] + + soup.find_all(text=["Tillie", "Elsie", "Lacie"]) + # [u'Elsie', u'Lacie', u'Tillie'] + + soup.find_all(text=re.compile("Dormouse")) + # [u"The Dormouse's story", u"The Dormouse's story"] + + def is_the_only_string_within_a_tag(s): + """Return True if this string is the only child of its parent tag.""" + return (s == s.parent.string) + + soup.find_all(text=is_the_only_string_within_a_tag) + # [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...'] + +Although ``text`` is for finding strings, you can combine it with +arguments for finding tags: Beautiful Soup will find all tags whose +``.string`` matches your value for ``text``. This code finds the +tags whose ``.string`` is "Elsie":: + + soup.find_all("a", text="Elsie") + # [Elsie] + +.. _limit: + +The ``limit`` argument +^^^^^^^^^^^^^^^^^^^^^^ + +``find_all()`` returns all the tags and strings that match your +filters. This can take a while if the document is large. If you don't +need `all` the results, you can pass in a number for ``limit``. This +works just like the LIMIT keyword in SQL. It tells Beautiful Soup to +stop gathering results after it's found a certain number. + +There are three links in the "three sisters" document, but this code +only finds the first two:: + + soup.find_all("a", limit=2) + # [Elsie, + # Lacie] + +.. 
_recursive: + +The ``recursive`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you call ``mytag.find_all()``, Beautiful Soup will examine all the +descendants of ``mytag``: its children, its children's children, and +so on. If you only want Beautiful Soup to consider direct children, +you can pass in ``recursive=False``. See the difference here:: + + soup.html.find_all("title") + # [The Dormouse's story] + + soup.html.find_all("title", recursive=False) + # [] + +Here's that part of the document:: + + + + + The Dormouse's story + + + ... + +The tag is beneath the <html> tag, but it's not `directly` +beneath the <html> tag: the <head> tag is in the way. Beautiful Soup +finds the <title> tag when it's allowed to look at all descendants of +the <html> tag, but when ``recursive=False`` restricts it to the +<html> tag's immediate children, it finds nothing. + +Beautiful Soup offers a lot of tree-searching methods (covered below), +and they mostly take the same arguments as ``find_all()``: ``name``, +``attrs``, ``text``, ``limit``, and the keyword arguments. But the +``recursive`` argument is different: ``find_all()`` and ``find()`` are +the only methods that support it. Passing ``recursive=False`` into a +method like ``find_parents()`` wouldn't be very useful. + +Calling a tag is like calling ``find_all()`` +-------------------------------------------- + +Because ``find_all()`` is the most popular method in the Beautiful +Soup search API, you can use a shortcut for it. If you treat the +``BeautifulSoup`` object or a ``Tag`` object as though it were a +function, then it's the same as calling ``find_all()`` on that +object. 
These two lines of code are equivalent:: + + soup.find_all("a") + soup("a") + +These two lines are also equivalent:: + + soup.title.find_all(text=True) + soup.title(text=True) + +``find()`` +---------- + +Signature: find(:ref:`name <name>`, :ref:`attrs <attrs>`, :ref:`recursive +<recursive>`, :ref:`text <text>`, :ref:`**kwargs <kwargs>`) + +The ``find_all()`` method scans the entire document looking for +results, but sometimes you only want to find one result. If you know a +document only has one <body> tag, it's a waste of time to scan the +entire document looking for more. Rather than passing in ``limit=1`` +every time you call ``find_all``, you can use the ``find()`` +method. These two lines of code are `nearly` equivalent:: + + soup.find_all('title', limit=1) + # [<title>The Dormouse's story] + + soup.find('title') + # The Dormouse's story + +The only difference is that ``find_all()`` returns a list containing +the single result, and ``find()`` just returns the result. + +If ``find_all()`` can't find anything, it returns an empty list. If +``find()`` can't find anything, it returns ``None``:: + + print(soup.find("nosuchtag")) + # None + +Remember the ``soup.head.title`` trick from `Navigating using tag +names`_? That trick works by repeatedly calling ``find()``:: + + soup.head.title + # The Dormouse's story + + soup.find("head").find("title") + # The Dormouse's story + +``find_parents()`` and ``find_parent()`` +---------------------------------------- + +Signature: find_parents(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) + +Signature: find_parent(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) + +I spent a lot of time above covering ``find_all()`` and +``find()``. The Beautiful Soup API defines ten other methods for +searching the tree, but don't be afraid. Five of these methods are +basically the same as ``find_all()``, and the other five are basically +the same as ``find()``. 
The only differences are in what parts of the +tree they search. + +First let's consider ``find_parents()`` and +``find_parent()``. Remember that ``find_all()`` and ``find()`` work +their way down the tree, looking at a tag's descendants. These methods +do the opposite: they work their way `up` the tree, looking at a tag's +(or a string's) parents. Let's try them out, starting from a string +buried deep in the "three sisters" document:: + + a_string = soup.find(text="Lacie") + a_string + # u'Lacie' + + a_string.find_parents("a") + # [Lacie] + + a_string.find_parent("p") + #
Once upon a time there were three little sisters; and their names were + # Elsie, + # Lacie and + # Tillie; + # and they lived at the bottom of a well.
+ + a_string.find_parents("p", "title") + # [] + +One of the three tags is the direct parent of the string in +question, so our search finds it. One of the three
tags is an +indirect parent of the string, and our search finds that as +well. There's a
tag with the CSS class "title" `somewhere` in the +document, but it's not one of this string's parents, so we can't find +it with ``find_parents()``. + +You may have made the connection between ``find_parent()`` and +``find_parents()``, and the `.parent`_ and `.parents`_ attributes +mentioned earlier. The connection is very strong. These search methods +actually use ``.parents`` to iterate over all the parents, and check +each one against the provided filter to see if it matches. + +``find_next_siblings()`` and ``find_next_sibling()`` +---------------------------------------------------- + +Signature: find_next_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) + +Signature: find_next_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) + +These methods use :ref:`.next_siblings ` to +iterate over the rest of an element's siblings in the tree. The +``find_next_siblings()`` method returns all the siblings that match, +and ``find_next_sibling()`` only returns the first one:: + + first_link = soup.a + first_link + # Elsie + + first_link.find_next_siblings("a") + # [Lacie, + # Tillie] + + first_story_paragraph = soup.find("p", "story") + first_story_paragraph.find_next_sibling("p") + #
...
+ +``find_previous_siblings()`` and ``find_previous_sibling()`` +------------------------------------------------------------ + +Signature: find_previous_siblings(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) + +Signature: find_previous_sibling(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) + +These methods use :ref:`.previous_siblings ` to iterate over an element's +siblings that precede it in the tree. The ``find_previous_siblings()`` +method returns all the siblings that match, and +``find_previous_sibling()`` only returns the first one:: + + last_link = soup.find("a", id="link3") + last_link + # Tillie + + last_link.find_previous_siblings("a") + # [Lacie, + # Elsie] + + first_story_paragraph = soup.find("p", "story") + first_story_paragraph.find_previous_sibling("p") + #
The Dormouse's story
+ + +``find_all_next()`` and ``find_next()`` +--------------------------------------- + +Signature: find_all_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) + +Signature: find_next(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) + +These methods use :ref:`.next_elements ` to +iterate over whatever tags and strings that come after it in the +document. The ``find_all_next()`` method returns all matches, and +``find_next()`` only returns the first match:: + + first_link = soup.a + first_link + # Elsie + + first_link.find_all_next(text=True) + # [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', + # u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n'] + + first_link.find_next("p") + #
...
+ +In the first example, the string "Elsie" showed up, even though it was +contained within the tag we started from. In the second example, +the last
tag in the document showed up, even though it's not in +the same part of the tree as the tag we started from. For these +methods, all that matters is that an element match the filter, and +show up later in the document than the starting element. + +``find_all_previous()`` and ``find_previous()`` +----------------------------------------------- + +Signature: find_all_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`limit `, :ref:`**kwargs `) + +Signature: find_previous(:ref:`name `, :ref:`attrs `, :ref:`text `, :ref:`**kwargs `) + +These methods use :ref:`.previous_elements ` to +iterate over the tags and strings that came before it in the +document. The ``find_all_previous()`` method returns all matches, and +``find_previous()`` only returns the first match:: + + first_link = soup.a + first_link + # Elsie + + first_link.find_all_previous("p") + # [
Once upon a time there were three little sisters; ...
, + #
The Dormouse's story
] + + first_link.find_previous("title") + # The Dormouse's story + +The call to ``find_all_previous("p")`` found the first paragraph in +the document (the one with class="title"), but it also finds the +second paragraph, the
tag that contains the tag we started +with. This shouldn't be too surprising: we're looking at all the tags +that show up earlier in the document than the one we started with. A +
tag that contains an tag must have shown up earlier in the +document. + +CSS selectors +------------- + +Beautiful Soup supports a subset of the `CSS selector standard +`_. Just construct the +selector as a string and pass it into the ``.select()`` method of a +``Tag`` or the ``BeautifulSoup`` object itself. + +You can find tags:: + + soup.select("title") + # [The Dormouse's story] + +Find tags beneath other tags:: + + soup.select("p a") + # [Elsie, Lacie, Tillie] + + soup.select("html head title") + # [The Dormouse's story] + +Find tags by CSS class:: + + soup.select(".sister") + # [Elsie, + # Lacie, + # Tillie] + + soup.select("[class~=sister]") + # [Elsie, + # Lacie, + # Tillie] + +Find tags by ID:: + + soup.select("#link1") + # [Elsie] + + soup.select("a#link2") + # [Lacie] + +Test for the existence of an attribute:: + + soup.select('a[href]') + # [Elsie, + # Lacie, + # Tillie] + +Find tags by attribute value:: + + soup.select('a[href="http://example.com/elsie"]') + # [Elsie] + + soup.select('a[href^="http://example.com/"]') + # [Elsie, + # Lacie, + # Tillie] + + soup.select('a[href$="tillie"]') + # [Tillie] + + soup.select('a[href*=".com/el"]') + # [Elsie] + +This is a convenience for users who know the CSS selector syntax. You +can do all this stuff with the Beautiful Soup API. And if CSS +selectors are all you need, you might as well use lxml directly, +because it's faster. But this lets you `combine` simple CSS selectors +with the Beautiful Soup API. + + +Modifying the tree +================== + +Beautiful Soup's main strength is in searching the parse tree, but you +can also modify the tree and write your changes as a new HTML or XML +document. + +Changing tag names and attributes +--------------------------------- + +I covered this earlier, in `Attributes`_, but it bears repeating. 
You +can rename a tag, change the values of its attributes, add new +attributes, and delete attributes:: + + soup = BeautifulSoup('Extremely bold') + tag = soup.b + + tag.name = "blockquote" + tag['class'] = 'verybold' + tag['id'] = 1 + tag + #
Extremely bold
+ + del tag['class'] + del tag['id'] + tag + #
Extremely bold
+ + +Modifying ``.string`` +--------------------- + +If you set a tag's ``.string`` attribute, the tag's contents are +replaced with the string you give:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + + tag = soup.a + tag.string = "New link text." + tag + # New link text. + +Be careful: if the tag contained other tags, they and all their +contents will be destroyed. + +``append()`` +------------ + +You can add to a tag's contents with ``Tag.append()``. It works just +like calling ``.append()`` on a Python list:: + + soup = BeautifulSoup("Foo") + soup.a.append("Bar") + + soup + # FooBar + soup.a.contents + # [u'Foo', u'Bar'] + +``BeautifulSoup.new_string()`` and ``.new_tag()`` +------------------------------------------------- + +If you need to add a string to a document, no problem--you can pass a +Python string in to ``append()``, or you can call the factory method +``BeautifulSoup.new_string()``:: + + soup = BeautifulSoup("") + tag = soup.b + tag.append("Hello") + new_string = soup.new_string(" there") + tag.append(new_string) + tag + # Hello there. + tag.contents + # [u'Hello', u' there'] + +What if you need to create a whole new tag? The best solution is to +call the factory method ``BeautifulSoup.new_tag()``:: + + soup = BeautifulSoup("") + original_tag = soup.b + + new_tag = soup.new_tag("a", href="http://www.example.com") + original_tag.append(new_tag) + original_tag + # + + new_tag.string = "Link text." + original_tag + # Link text. + +Only the first argument, the tag name, is required. + +``insert()`` +------------ + +``Tag.insert()`` is just like ``Tag.append()``, except the new element +doesn't necessarily go at the end of its parent's +``.contents``. It'll be inserted at whatever numeric position you +say. 
It works just like ``.insert()`` on a Python list:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + tag = soup.a + + tag.insert(1, "but did not endorse ") + tag + # I linked to but did not endorse example.com + tag.contents + # [u'I linked to ', u'but did not endorse ', example.com] + +``insert_before()`` and ``insert_after()`` +------------------------------------------ + +The ``insert_before()`` method inserts a tag or string immediately +before something else in the parse tree:: + + soup = BeautifulSoup("stop") + tag = soup.new_tag("i") + tag.string = "Don't" + soup.b.string.insert_before(tag) + soup.b + # Don'tstop + +The ``insert_after()`` method moves a tag or string so that it +immediately follows something else in the parse tree:: + + soup.b.i.insert_after(soup.new_string(" ever ")) + soup.b + # Don't ever stop + soup.b.contents + # [Don't, u' ever ', u'stop'] + +``clear()`` +----------- + +``Tag.clear()`` removes the contents of a tag:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + tag = soup.a + + tag.clear() + tag + # + +``extract()`` +------------- + +``PageElement.extract()`` removes a tag or string from the tree. It +returns the tag or string that was extracted:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + a_tag = soup.a + + i_tag = soup.i.extract() + + a_tag + # I linked to + + i_tag + # example.com + + print(i_tag.parent) + # None + +At this point you effectively have two parse trees: one rooted at the +``BeautifulSoup`` object you used to parse the document, and one rooted +at the tag that was extracted. 
You can go on to call ``extract`` on +a child of the element you extracted:: + + my_string = i_tag.string.extract() + my_string + # u'example.com' + + print(my_string.parent) + # None + i_tag + # + + +``decompose()`` +--------------- + +``Tag.decompose()`` removes a tag from the tree, then `completely +destroys it and its contents`:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + a_tag = soup.a + + soup.i.decompose() + + a_tag + # I linked to + + +.. _replace_with: + +``replace_with()`` +------------------ + +``PageElement.replace_with()`` removes a tag or string from the tree, +and replaces it with the tag or string of your choice:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + a_tag = soup.a + + new_tag = soup.new_tag("b") + new_tag.string = "example.net" + a_tag.i.replace_with(new_tag) + + a_tag + # I linked to example.net + +``replace_with()`` returns the tag or string that was replaced, so +that you can examine it or add it back to another part of the tree. + +``replace_with_children()`` +--------------------------- + +``Tag.replace_with_children()`` replaces a tag with whatever's inside +that tag. It's good for stripping out markup:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + a_tag = soup.a + + a_tag.i.replace_with_children() + a_tag + # I linked to example.com + +Like ``replace_with()``, ``replace_with_children()`` returns the tag +that was replaced. + +Output +====== + +Pretty-printing +--------------- + +The ``prettify()`` method will turn a Beautiful Soup parse tree into a +nicely formatted bytestring, with each HTML/XML tag on its own line:: + + markup = 'I linked to example.com' + soup = BeautifulSoup(markup) + soup.prettify() + # '\n \n \n \n \n...' 
+ + print(soup.prettify()) + # + # + # + # + # + # I linked to + # + # example.com + # + # + # + # + +You can call ``prettify()`` on the top-level ``BeautifulSoup`` object, +or on any of its ``Tag`` objects:: + + print(soup.a.prettify()) + # + # I linked to + # + # example.com + # + # + +Non-pretty printing +------------------- + +If you just want a string, with no fancy formatting, you can call +``unicode()`` or ``str()`` on a ``BeautifulSoup`` object, or a ``Tag`` +within it:: + + str(soup) + # 'I linked to example.com' + + unicode(soup.a) + # u'I linked to example.com' + +The ``str()`` function returns a string encoded in UTF-8. See +`Encodings`_ for other options. + +You can also call ``encode()`` to get a bytestring, and ``decode()`` +to get Unicode. + +Output formatters +----------------- + +If you give Beautiful Soup a document that contains HTML entities like +"&ldquo;", they'll be converted to Unicode characters:: + + soup = BeautifulSoup("&ldquo;Dammit!&rdquo; he said.") + unicode(soup) + # u'\u201cDammit!\u201d he said.' + +If you then convert the document to a string, the Unicode characters +will be encoded as UTF-8. You won't get the HTML entities back:: + + str(soup) + # '\xe2\x80\x9cDammit!\xe2\x80\x9d he said.' + +By default, the only characters that are escaped upon output are bare +ampersands and angle brackets. These get turned into "&amp;", "&lt;", +and "&gt;", so that Beautiful Soup doesn't inadvertently generate +invalid HTML or XML:: + + soup = BeautifulSoup("
The law firm of Dewey, Cheatem, & Howe
") + soup.p + #
The law firm of Dewey, Cheatem, & Howe
+ +You can change this behavior by providing a value for the +``formatter`` argument to ``prettify()``, ``encode()``, or +``decode()``. Beautiful Soup recognizes four possible values for +``formatter``. + +The default is ``formatter="minimal"``. Strings will only be processed +enough to ensure that Beautiful Soup generates valid HTML/XML:: + + french = "
Il a dit <<Sacré bleu!>>
" + soup = BeautifulSoup(french) + print(soup.prettify(formatter="minimal")) + # + # + #
+ # Il a dit <<Sacré bleu!>> + #
+ # + # + +If you pass in ``formatter="html"``, Beautiful Soup will convert +Unicode characters to HTML entities whenever possible:: + + print(soup.prettify(formatter="html")) + # + # + #
+ # Il a dit <<Sacré bleu!>> + #
+ # + # + +If you pass in ``formatter=None``, Beautiful Soup will not modify +strings at all on output. This is the fastest option, but it may lead +to Beautiful Soup generating invalid HTML/XML, as in this example:: + + print(soup.prettify(formatter=None)) + # + # + #
+ # Il a dit <> + #
+ # + # + + +Finally, if you pass in a function for ``formatter``, Beautiful Soup +will call that function once for every string in the document. You can +do whatever you want in this function. Here's a formatter that +converts strings to uppercase and does absolutely nothing else:: + + def uppercase(str): + return str.upper() + + print(soup.prettify(formatter=uppercase)) + # + # + #
+ # IL A DIT <> + #
+ # + # + +If you're writing your own function, you should know about the +``EntitySubstitution`` class in the ``bs4.dammit`` module. This class +implements Beautiful Soup's standard formatters as class methods: the +"html" formatter is ``EntitySubstitution.substitute_html``, and the +"minimal" formatter is ``EntitySubstitution.substitute_xml``. You can +use these functions to simulate ``formatter="html"`` or +``formatter="minimal"``, and then do something in addition. + +Here's an example that converts strings to uppercase, `and` replaces +Unicode characters with HTML entities whenever possible:: + + from bs4.dammit import EntitySubstitution + def uppercase_and_substitute_html_entities(str): + return EntitySubstitution.substitute_html(str.upper()) + + print(soup.prettify(formatter=uppercase_and_substitute_html_entities)) + # + # + #
+ # IL A DIT <<SACRÉ BLEU!>> + #
+ # + # + +``get_text()`` +-------------- + +If you only want the text part of a document or tag, you can use the +``get_text()`` method. It returns all the text in a document or +beneath a tag, as a single Unicode string:: + + markup = '\nI linked to example.com\n' + soup = BeautifulSoup(markup) + + soup.get_text() + u'\nI linked to example.com\n' + soup.i.get_text() + u'example.com' + +You can specify a string to be used to join the bits of text +together:: + + # soup.get_text("|") + u'\nI linked to |example.com|\n' + +You can tell Beautiful Soup to strip whitespace from the beginning and +end of each bit of text:: + + # soup.get_text("|", strip=True) + u'I linked to|example.com' + +But at that point you might want to use the :ref:`.stripped_strings ` +generator instead, and process the text yourself:: + + [text for text in soup.stripped_strings] + # [u'I linked to', u'example.com'] + +Choosing a parser +================= + +If you just need to parse some HTML, you can dump the markup into the +``BeautifulSoup`` constructor, and it'll probably be fine. Beautiful +Soup will pick a parser for you and parse the data. But there are a +few additional arguments you can pass in to the constructor to change +which parser is used. + +The first argument to the ``BeautifulSoup`` constructor is a string or +an open filehandle--the markup you want parsed. The second argument is +`how` you'd like the markup parsed. + +If you don't specify anything, you'll get the best HTML parser that's +installed. Beautiful Soup ranks lxml's parser as being the best, then +html5lib's, then Python's built-in parser. You can override this by +specifying one of the following: + +* What type of markup you want to parse. Currently supported are + "html", "xml", and "html5". + +* The name of the parser library you want to use. Currently supported + options are "lxml", "html5lib", and "html.parser" (Python's + built-in HTML parser). 
+ +Some examples:: + + BeautifulSoup(markup, "lxml") + BeautifulSoup(markup, "xml") + BeautifulSoup(markup, "html5") + +You can specify a list of the parser features you want, instead of +just one. Right now this is mostly useful for distinguishing between +lxml's HTML parser and its XML parser:: + + BeautifulSoup(markup, ["html", "lxml"]) + BeautifulSoup(markup, ["xml", "lxml"]) + +If you don't have an appropriate parser installed, Beautiful Soup will +ignore your request and pick a different parser. For instance, right +now the only supported XML parser is lxml, so if you don't have lxml +installed, asking for an XML parser won't give you one, and asking for +"lxml" won't work either. + +Why would you use one parser over another? Because different parsers +will create different parse trees from the same document. The biggest +differences are between HTML parsers and XML parsers. Here's a short +document, parsed as HTML:: + + BeautifulSoup("") + # + +Since an empty tag is not valid HTML, the parser turns it into a + tag pair. + +Here's the same document parsed as XML (running this requires that you +have lxml installed). Note that the empty tag is left alone, and +that the document is given an XML declaration instead of being put +into an tag.:: + + BeautifulSoup("", "xml") + # + # + +There are also differences between HTML parsers. If you give Beautiful +Soup a perfectly-formed HTML document, these differences won't +matter. One parser may be faster than another, but they'll all give +you a data structure that looks exactly like the original HTML +document. + +But if the document is not perfectly-formed, different parsers will +give different results. Here's a short, invalid document parsed using +lxml's HTML parser. Note that the dangling
tag is simply +ignored:: + + BeautifulSoup("
", "lxml") + # + +Here's the same document parsed using html5lib:: + + BeautifulSoup("
", "html5lib") + #
+ +Instead of ignoring the dangling
tag, html5lib pairs it with an +opening
tag. This parser also adds an empty tag to the +document. + +Here's the same document parsed with Python's built-in HTML +parser:: + + BeautifulSoup("
", "html.parser") + # + +Like lxml, this parser ignores the closing
tag. Unlike +html5lib, this parser makes no attempt to create a well-formed HTML +document by adding a tag. Unlike lxml, it doesn't even bother +to add an tag. + +Since the document "
" is invalid, none of these techniques is +the "correct" way to handle it. The html5lib parser uses techniques +that are part of the HTML5 standard, so it has the best claim on being +the "correct" way, but all three techniques are legitimate. + +Differences between parsers can affect your script. If you're planning +on distributing your script to other people, you might want to specify +in the ``BeautifulSoup`` constructor which parser you used during +development. That will reduce the chances that your users parse a +document differently from the way you parse it. + + +Encodings +========= + +Any HTML or XML document is written in a specific encoding like ASCII +or UTF-8. But when you load that document into Beautiful Soup, you'll +discover it's been converted to Unicode:: + + markup = "
Sacr\xc3\xa9 bleu!
" + soup = BeautifulSoup(markup) + soup.h1 + #
Sacré bleu!
+ soup.h1.string + # u'Sacr\xe9 bleu!' + +It's not magic. (That sure would be nice.) Beautiful Soup uses a +sub-library called `Unicode, Dammit`_ to detect a document's encoding +and convert it to Unicode. The autodetected encoding is available as +the ``.original_encoding`` attribute of the ``BeautifulSoup`` object:: + + soup.original_encoding + 'utf-8' + +Unicode, Dammit guesses correctly most of the time, but sometimes it +makes mistakes. Sometimes it guesses correctly, but only after a +byte-by-byte search of the document that takes a very long time. If +you happen to know a document's encoding ahead of time, you can avoid +mistakes and delays by passing it to the ``BeautifulSoup`` constructor +as ``from_encoding``. + +Here's a document written in ISO-8859-8. The document is so short that +Unicode, Dammit can't get a good lock on it, and misidentifies it as +ISO-8859-7:: + + markup = b"
\xed\xe5\xec\xf9
" + soup = BeautifulSoup(markup) + soup.h1 +
νεμω
+ soup.original_encoding + 'ISO-8859-7' + +We can fix this by passing in the correct ``from_encoding``:: + + soup = BeautifulSoup(markup, from_encoding="iso-8859-8") + soup.h1 +
םולש
+ soup.original_encoding + 'iso8859-8' + +In rare cases (usually when a UTF-8 document contains text written in +a completely different encoding), the only way to get Unicode may be +to replace some characters with the special Unicode character +"REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do +this, it will set the ``.contains_replacement_characters`` attribute +to ``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This +lets you know that the Unicode representation is not an exact +representation of the original--some data was lost. If a document +contains �, but ``.contains_replacement_characters`` is ``False``, +you'll know that the � was there originally (as it is in this +paragraph) and doesn't stand in for missing data. + +Output encoding +--------------- + +When you write out a document from Beautiful Soup, you get a UTF-8 +document, even if the document wasn't in UTF-8 to begin with. Here's a +document written in the Latin-1 encoding:: + + markup = b''' + + + + + +
Sacr\xe9 bleu!
+ + + ''' + + soup = BeautifulSoup(markup) + print(soup.prettify()) + # + # + # + # + # + #
+ # Sacré bleu! + #
+ # + # + +Note that the tag has been rewritten to reflect the fact that +the document is now in UTF-8. + +If you don't want UTF-8, you can pass an encoding into ``prettify()``:: + + print(soup.prettify("latin-1")) + # + # + # + # ... + +You can also call encode() on the ``BeautifulSoup`` object, or any +element in the soup, just as if it were a Python string:: + + soup.p.encode("latin-1") + # '
Sacr\xe9 bleu!
' + + soup.p.encode("utf-8") + # '
Sacr\xc3\xa9 bleu!
' + +Any characters that can't be represented in your chosen encoding will +be converted into numeric XML entity references. For instance, here's +a document that includes the Unicode character SNOWMAN:: + + markup = u"<b>\N{SNOWMAN}</b>" + snowman_soup = BeautifulSoup(markup) + tag = snowman_soup.b + +The SNOWMAN character can be part of a UTF-8 document (it looks like +☃), but there's no representation for that character in ISO-Latin-1 or +ASCII, so it's converted into "&#9731;" for those encodings:: + + print(tag.encode("utf-8")) + # <b>☃</b> + + print(tag.encode("latin-1")) + # <b>&#9731;</b> + + print(tag.encode("ascii")) + # <b>&#9731;</b> + +Unicode, Dammit +--------------- + +You can use Unicode, Dammit without using Beautiful Soup. It's useful +whenever you have data in an unknown encoding and you just want it to +become Unicode:: + + from bs4 import UnicodeDammit + dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!") + print(dammit.unicode_markup) + # Sacré bleu! + dammit.original_encoding + # 'utf-8' + +The more data you give Unicode, Dammit, the more accurately it will +guess. If you have your own suspicions as to what the encoding might +be, you can pass them in as a list:: + + dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"]) + print(dammit.unicode_markup) + # Sacré bleu! + dammit.original_encoding + # 'latin-1' + +Unicode, Dammit has one special feature that Beautiful Soup doesn't +use. You can use it to convert Microsoft smart quotes to HTML or XML +entities:: + + markup = b"
I just \x93love\x94 Microsoft Word
" + + UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="html").unicode_markup + # u'
I just “love” Microsoft Word
' + + UnicodeDammit(markup, ["windows-1252"], smart_quotes_to="xml").unicode_markup + # u'
I just “love” Microsoft Word
' + +You might find this feature useful, but Beautiful Soup doesn't use +it. Beautiful Soup prefers the default behavior, which is to convert +Microsoft smart quotes to Unicode characters along with everything +else:: + + UnicodeDammit(markup, ["windows-1252"]).unicode_markup + # u'
I just \u201clove\u201d Microsoft Word
' + +Parsing only part of a document +=============================== + +Let's say you want to use Beautiful Soup to look at a document's <a> +tags. It's a waste of time and memory to parse the entire document and +then go over it again looking for <a> tags. It would be much faster to +ignore everything that wasn't an <a> tag in the first place. The +``SoupStrainer`` class allows you to choose which parts of an incoming +document are parsed. You just create a ``SoupStrainer`` and pass it in +to the ``BeautifulSoup`` constructor as the ``parse_only`` argument. + +(Note that *this feature won't work if you're using the html5lib +parser*. If you use html5lib, the whole document will be parsed, no +matter what. In the examples below, I'll be forcing Beautiful Soup to +use Python's built-in parser.) + +``SoupStrainer`` +---------------- + +The ``SoupStrainer`` class takes the same arguments as a typical +method from `Searching the tree`_: :ref:`name `, :ref:`attrs +`, :ref:`text `, and :ref:`**kwargs `. Here are +three ``SoupStrainer`` objects:: + + from bs4 import SoupStrainer + + only_a_tags = SoupStrainer("a") + + only_tags_with_id_link2 = SoupStrainer(id="link2") + + def is_short_string(string): + return len(string) < 10 + + only_short_strings = SoupStrainer(text=is_short_string) + +I'm going to bring back the "three sisters" document one more time, +and we'll see what the document looks like when it's parsed with these +three ``SoupStrainer`` objects:: + + html_doc = """ + The Dormouse's story + +
The Dormouse's story
+ +
Once upon a time there were three little sisters; and their names were + Elsie, + Lacie and + Tillie; + and they lived at the bottom of a well.
+ +
...
+ """ + + print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify()) + # + # Elsie + # + # + # Lacie + # + # + # Tillie + # + + print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify()) + # + # Lacie + # + + print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify()) + # Elsie + # , + # Lacie + # and + # Tillie + # ... + # + +You can also pass a ``SoupStrainer`` into any of the methods covered +in `Searching the tree`_. This probably isn't terribly useful, but I +thought I'd mention it:: + + soup = BeautifulSoup(html_doc) + soup.find_all(only_short_strings) + # [u'\n\n', u'\n\n', u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie', + # u'\n\n', u'...', u'\n'] + +Troubleshooting +=============== + +Parsing XML +----------- + +By default, Beautiful Soup parses documents as HTML. To parse a +document as XML, pass in "xml" as the second argument to the +``BeautifulSoup`` constructor:: + + soup = BeautifulSoup(markup, "xml") + +You'll need to :ref:`have lxml installed `. + +Improving Performance +--------------------- + +Beautiful Soup will never be as fast as the parsers it sits on top +of. If response time is critical, if you're paying for computer time +by the hour, or if there's any other reason why computer time is more +valuable than programmer time, you should forget about Beautiful Soup +and work directly atop `lxml `_. + +That said, there are things you can do to speed up Beautiful Soup. If +you're not using lxml as the underlying parser, my advice is to +:ref:`start `. Beautiful Soup parses documents +significantly faster using lxml than using html.parser or html5lib. + +Sometimes `Unicode, Dammit`_ can only detect the encoding of a file by +doing a byte-by-byte examination of the file. This slows Beautiful +Soup to a crawl. My tests indicate that this only happened on 2.x +versions of Python, and that it happened most often with documents +using Russian or Chinese encodings. 
If this is happening to you, you +can fix it by using Python 3 for your script. Or, if you happen to +know a document's encoding, you can pass it into the +``BeautifulSoup`` constructor as ``from_encoding``. + +`Parsing only part of a document`_ won't save you much time parsing +the document, but it can save a lot of memory, and it'll make +`searching` the document much faster. + +Beautiful Soup 3 +================ + +Beautiful Soup 3.2.0 is the old version, the last release of the +Beautiful Soup 3 series. It's currently the version packaged with all +major Linux distributions: + +:kbd:`$ apt-get install python-beautifulsoup` + +It's also published through PyPI as ``BeautifulSoup``: + +:kbd:`$ easy_install BeautifulSoup` + +:kbd:`$ pip install BeautifulSoup` + +You can also `download a tarball of Beautiful Soup 3.2.0 +`_. + +If you ran ``easy_install beautifulsoup`` or ``easy_install +BeautifulSoup``, but your code doesn't work, you installed Beautiful +Soup 3 by mistake. You need to run ``easy_install beautifulsoup4``. + +`The documentation for Beautiful Soup 3 is archived online +`_. If +your first language is Chinese, it might be easier for you to read +`the Chinese translation of the Beautiful Soup 3 documentation +`_, +then read this document to find out about the changes made in +Beautiful Soup 4. + +Porting code to BS4 +------------------- + +Most code written against Beautiful Soup 3 will work against Beautiful +Soup 4 with one simple change. All you should have to do is change the +package name from ``BeautifulSoup`` to ``bs4``. So this:: + + from BeautifulSoup import BeautifulSoup + +becomes this:: + + from bs4 import BeautifulSoup + +* If you get the ``ImportError`` "No module named BeautifulSoup", your + problem is that you're trying to run Beautiful Soup 3 code, but you + only have Beautiful Soup 4 installed.
+ +* If you get the ``ImportError`` "No module named bs4", your problem + is that you're trying to run Beautiful Soup 4 code, but you only + have Beautiful Soup 3 installed. + +Although BS4 is mostly backwards-compatible with BS3, most of its +methods have been deprecated and given new names for `PEP 8 compliance +`_. There are numerous other +renames and changes, and a few of them break backwards compatibility. + +Here's what you'll need to know to convert your BS3 code and habits to BS4: + +You need a parser +^^^^^^^^^^^^^^^^^ + +Beautiful Soup 3 used Python's ``SGMLParser``, a module that was +deprecated and removed in Python 3.0. Beautiful Soup 4 uses +``html.parser`` by default, but you can plug in lxml or html5lib and +use that instead. Until ``html.parser`` is improved to handle +real-world HTML better, that's what I recommend you do. See `Be sure +to install a good parser!`_ + +Method names +^^^^^^^^^^^^ + +* ``replaceWith`` -> ``replace_with`` +* ``replaceWithChildren`` -> ``replace_with_children`` +* ``findAll`` -> ``find_all`` +* ``findAllNext`` -> ``find_all_next`` +* ``findAllPrevious`` -> ``find_all_previous`` +* ``findNext`` -> ``find_next`` +* ``findNextSibling`` -> ``find_next_sibling`` +* ``findNextSiblings`` -> ``find_next_siblings`` +* ``findParent`` -> ``find_parent`` +* ``findParents`` -> ``find_parents`` +* ``findPrevious`` -> ``find_previous`` +* ``findPreviousSibling`` -> ``find_previous_sibling`` +* ``findPreviousSiblings`` -> ``find_previous_siblings`` +* ``nextSibling`` -> ``next_sibling`` +* ``previousSibling`` -> ``previous_sibling`` + +Some arguments to the Beautiful Soup constructor were renamed for the +same reasons: + +* ``BeautifulSoup(parseOnlyThese=...)`` -> ``BeautifulSoup(parse_only=...)`` +* ``BeautifulSoup(fromEncoding=...)`` -> ``BeautifulSoup(from_encoding=...)`` + +I renamed one method for compatibility with Python 3: + +* ``Tag.has_key()`` -> ``Tag.has_attr()`` + +I renamed one attribute to use more accurate terminology: 
+ +* ``Tag.isSelfClosing`` -> ``Tag.is_empty_element`` + +I renamed three attributes to avoid using words that have special +meaning to Python. Unlike the others, these changes are *not backwards +compatible.* If you used these attributes in BS3, your code will break +on BS4 until you change them. + +* ``UnicodeDammit.unicode`` -> ``UnicodeDammit.unicode_markup`` +* ``Tag.next`` -> ``Tag.next_element`` +* ``Tag.previous`` -> ``Tag.previous_element`` + +Generators +^^^^^^^^^^ + +I gave the generators PEP 8-compliant names, and transformed them into +properties: + +* ``childGenerator()`` -> ``children`` +* ``nextGenerator()`` -> ``next_elements`` +* ``nextSiblingGenerator()`` -> ``next_siblings`` +* ``previousGenerator()`` -> ``previous_elements`` +* ``previousSiblingGenerator()`` -> ``previous_siblings`` +* ``recursiveChildGenerator()`` -> ``descendants`` +* ``parentGenerator()`` -> ``parents`` + +So instead of this:: + + for parent in tag.parentGenerator(): + ... + +You can write this:: + + for parent in tag.parents: + ... + +(But the old code will still work.) + +Some of the generators used to yield ``None`` after they were done, and +then stop. That was a bug. Now the generators just stop. + +There are two new generators, :ref:`.strings and +.stripped_strings `. ``.strings`` yields +NavigableString objects, and ``.stripped_strings`` yields Python +strings that have had whitespace stripped. + +XML +^^^ + +There is no longer a ``BeautifulStoneSoup`` class for parsing XML. To +parse XML you pass in "xml" as the second argument to the +``BeautifulSoup`` constructor. For the same reason, the +``BeautifulSoup`` constructor no longer recognizes the ``isHTML`` +argument. + +Beautiful Soup's handling of empty-element XML tags has been +improved. Previously when you parsed XML you had to explicitly say +which tags were considered empty-element tags. The ``selfClosingTags`` +argument to the constructor is no longer recognized. 
Instead, +Beautiful Soup considers any empty tag to be an empty-element tag. If +you add a child to an empty-element tag, it stops being an +empty-element tag. + +Entities +^^^^^^^^ + +An incoming HTML or XML entity is always converted into the +corresponding Unicode character. Beautiful Soup 3 had a number of +overlapping ways of dealing with entities, which have been +removed. The ``BeautifulSoup`` constructor no longer recognizes the +``smartQuotesTo`` or ``convertEntities`` arguments. (`Unicode, +Dammit`_ still has ``smart_quotes_to``, but its default is now to turn +smart quotes into Unicode.) + +If you want to turn those Unicode characters back into HTML entities +on output, rather than turning them into UTF-8 characters, you need to +pass ``formatter="html"`` to ``prettify()``, ``encode()``, or +``decode()``, as described in `Output formatters`_. This +may change before the final release. + +Miscellaneous +^^^^^^^^^^^^^ + +:ref:`Tag.string <.string>` now operates recursively. If tag A +contains a single tag B and nothing else, then A.string is the same as +B.string. (Previously, it was None.) + +`Multi-valued attributes`_ like ``class`` have lists of strings as +their values, not strings. This may affect the way you search by CSS +class. + +If you pass one of the ``find*`` methods both :ref:`text ` `and` +a tag-specific argument like :ref:`name `, Beautiful Soup will +search for tags that match your tag-specific criteria and whose +:ref:`Tag.string <.string>` matches your value for :ref:`text +`. It will `not` find the strings themselves. Previously, +Beautiful Soup ignored the tag-specific arguments and looked for +strings. + +The ``BeautifulSoup`` constructor no longer recognizes the +``markupMassage`` argument. It's now the parser's responsibility to +handle markup correctly. + +The rarely-used alternate parser classes like +``ICantBelieveItsBeautifulSoup`` and ``BeautifulSOAP`` have been +removed. It's now the parser's decision how to handle ambiguous +markup. -- cgit v1.2.3