diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 19:30:01 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-20 19:30:01 -0500 |
commit | c3090d7e7337f88853fc5371c6d8011eb638c37f (patch) | |
tree | cfdc90333c894ade4e2dad99e16329253be5fea5 | |
parent | 39a2b266b634aa2eca4329a6719e090087113f46 (diff) |
Renamed constructor arguments to comply with PEP 8.
-rw-r--r-- | CHANGELOG | 5 | ||||
-rw-r--r-- | beautifulsoup/__init__.py | 56 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 2 | ||||
-rw-r--r-- | tests/test_html5lib.py | 4 | ||||
-rw-r--r-- | tests/test_lxml.py | 4 | ||||
-rw-r--r-- | tests/test_soup.py | 2 | ||||
-rw-r--r-- | tests/test_tree.py | 2 |
7 files changed, 30 insertions, 45 deletions
@@ -21,6 +21,11 @@ Some attributes have also been renamed: * Tag.isSelfClosing -> Tag.is_empty_element +So have some arguments to popular methods: + + * BeautifulSoup(parseOnlyThese=...) -> BeautifulSoup(parse_only=...) + * BeautifulSoup(fromEncoding=...) -> BeautifulSoup(from_encoding=...) + == Generators are now properties == The generators have been given more sensible (and PEP 8-compliant) diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 968be08..b8598e2 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -3,34 +3,14 @@ Elixir and Tonic "The Screen-Scraper's Friend" http://www.crummy.com/software/BeautifulSoup/ -Beautiful Soup parses a (possibly invalid) XML or HTML document into a -tree representation. It provides methods and Pythonic idioms that make -it easy to navigate, search, and modify the tree. +Beautiful Soup uses a plug-in parser to parse a (possibly invalid) XML +or HTML document into a tree representation. The parser does the work +of building a parse tree, and Beautiful Soup provides methods +and Pythonic idioms that make it easy to navigate, search, and modify +the parse tree. -A well-formed XML/HTML document yields a well-formed data -structure. An ill-formed XML/HTML document yields a correspondingly -ill-formed data structure. If your document is only locally -well-formed, you can use this library to find and process the -well-formed part of it. - -Beautiful Soup works with Python 2.2 and up. It has no external -dependencies, but you'll have more success at converting data to UTF-8 -if you also install these three packages: - -* chardet, for auto-detecting character encodings - http://chardet.feedparser.org/ -* cjkcodecs and iconv_codec, which add more encodings to the ones supported - by stock Python. 
- http://cjkpython.i18n.org/ - -Beautiful Soup defines classes for two main parsing strategies: - - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific - language that kind of looks like XML. - - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid - or invalid. This class has web browser-like heuristics for - obtaining a sensible parse tree in the face of common HTML errors. +Beautiful Soup works with Python 2.5 and up. To get it to work, you +must install either lxml or html5lib. For more than you ever wanted to know about Beautiful Soup, see the documentation: @@ -38,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/documentation.html Here, have some legalese: -Copyright (c) 2004-2009, Leonard Richardson +Copyright (c) 2004-2011, Leonard Richardson All rights reserved. @@ -127,8 +107,8 @@ class BeautifulSoup(Tag): from builder import LXMLTreeBuilder return LXMLTreeBuilder() - def __init__(self, markup="", builder=None, parseOnlyThese=None, - fromEncoding=None): + def __init__(self, markup="", builder=None, parse_only=None, + from_encoding=None): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" @@ -138,14 +118,14 @@ class BeautifulSoup(Tag): self.builder = builder self.builder.soup = self - self.parseOnlyThese = parseOnlyThese + self.parse_only = parse_only self.reset() if hasattr(markup, 'read'): # It's a file-type object. 
markup = markup.read() self.markup, self.original_encoding, self.declared_html_encoding = ( - self.builder.prepare_markup(markup, fromEncoding)) + self.builder.prepare_markup(markup, from_encoding)) try: self._feed() @@ -201,9 +181,9 @@ class BeautifulSoup(Tag): else: currentData = ' ' self.currentData = [] - if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(currentData)): return o = containerClass(currentData) self.object_was_parsed(o) @@ -251,9 +231,9 @@ class BeautifulSoup(Tag): #print "Start tag %s: %s" % (name, attrs) self.endData() - if (self.parseOnlyThese and len(self.tagStack) <= 1 - and (self.parseOnlyThese.text - or not self.parseOnlyThese.searchTag(name, attrs))): + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.searchTag(name, attrs))): return None tag = Tag(self, self.builder, name, attrs, self.currentTag, diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 522960a..854cc56 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -208,7 +208,7 @@ class HTMLTreeBuilder(TreeBuilder): match = self.CHARSET_RE.search(content) if match: if (self.soup.declared_html_encoding is not None or - self.soup.original_encoding == self.soup.fromEncoding): + self.soup.original_encoding == self.soup.from_encoding): # An HTML encoding was sniffed while converting # the document to Unicode, or an HTML encoding was # sniffed during a previous pass through the diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index aa0bad2..5abc29d 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -18,7 +18,7 @@ class TestHTML5Builder(TestLXMLBuilder): strainer = SoupStrainer("b") markup = "<p>A <b>bold</b> statement.</p>" soup = 
self.soup(markup, - parseOnlyThese=strainer) + parse_only=strainer) self.assertEquals( soup.decode(), self.document_for(markup)) @@ -210,7 +210,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): # A real-world test to make sure we can convert ISO-8859-8 (a # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, - fromEncoding="iso-8859-8") + from_encoding="iso-8859-8") self.assertEquals(soup.original_encoding, 'iso8859-8') self.assertEquals( soup.encode('utf-8'), diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 9d08aef..df2f341 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -325,7 +325,7 @@ class TestLXMLBuilder(SoupTest): def test_soupstrainer(self): strainer = SoupStrainer("b") soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>", - parseOnlyThese=strainer) + parse_only=strainer) self.assertEquals(soup.decode(), "<b>bold</b>") @@ -506,7 +506,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest): # A real-world test to make sure we can convert ISO-8859-8 (a # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, - fromEncoding="iso-8859-8") + from_encoding="iso-8859-8") self.assertEquals(soup.original_encoding, 'iso-8859-8') self.assertEquals( soup.encode('utf-8'), diff --git a/tests/test_soup.py b/tests/test_soup.py index 01dff53..bb2262a 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -12,7 +12,7 @@ class TestSelectiveParsing(SoupTest): def test_parse_with_soupstrainer(self): markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>" strainer = SoupStrainer("b") - soup = self.soup(markup, parseOnlyThese=strainer) + soup = self.soup(markup, parse_only=strainer) self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>") diff --git a/tests/test_tree.py b/tests/test_tree.py index 384d518..cefdf4a 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -863,7 +863,7 @@ class TestSubstitutions(SoupTest): # meta tag got filtered out by the strainer. 
This test makes # sure that doesn't happen. strainer = SoupStrainer('pre') - soup = self.soup(markup, parseOnlyThese=strainer) + soup = self.soup(markup, parse_only=strainer) self.assertEquals(soup.contents[0].name, 'pre') |