summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 19:30:01 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-20 19:30:01 -0500
commit: c3090d7e7337f88853fc5371c6d8011eb638c37f (patch)
tree: cfdc90333c894ade4e2dad99e16329253be5fea5
parent: 39a2b266b634aa2eca4329a6719e090087113f46 (diff)
Renamed constructor arguments to comply with PEP 8.
-rw-r--r-- CHANGELOG 5
-rw-r--r-- beautifulsoup/__init__.py 56
-rw-r--r-- beautifulsoup/builder/__init__.py 2
-rw-r--r-- tests/test_html5lib.py 4
-rw-r--r-- tests/test_lxml.py 4
-rw-r--r-- tests/test_soup.py 2
-rw-r--r-- tests/test_tree.py 2
7 files changed, 30 insertions, 45 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 3fb4f36..c9a4ca7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -21,6 +21,11 @@ Some attributes have also been renamed:
* Tag.isSelfClosing -> Tag.is_empty_element
+So have some arguments to popular methods:
+
+ * BeautifulSoup(parseOnlyThese=...) -> BeautifulSoup(parse_only=...)
+ * BeautifulSoup(fromEncoding=...) -> BeautifulSoup(from_encoding=...)
+
== Generators are now properties ==
The generators have been given more sensible (and PEP 8-compliant)
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 968be08..b8598e2 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -3,34 +3,14 @@ Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
-Beautiful Soup parses a (possibly invalid) XML or HTML document into a
-tree representation. It provides methods and Pythonic idioms that make
-it easy to navigate, search, and modify the tree.
+Beautiful Soup uses a plug-in parser to parse a (possibly invalid) XML
+or HTML document into a tree representation. The parser does the work
+of building a parse tree, and Beautiful Soup provides methods
+and Pythonic idioms that make it easy to navigate, search, and modify
+the parse tree.
-A well-formed XML/HTML document yields a well-formed data
-structure. An ill-formed XML/HTML document yields a correspondingly
-ill-formed data structure. If your document is only locally
-well-formed, you can use this library to find and process the
-well-formed part of it.
-
-Beautiful Soup works with Python 2.2 and up. It has no external
-dependencies, but you'll have more success at converting data to UTF-8
-if you also install these three packages:
-
-* chardet, for auto-detecting character encodings
- http://chardet.feedparser.org/
-* cjkcodecs and iconv_codec, which add more encodings to the ones supported
- by stock Python.
- http://cjkpython.i18n.org/
-
-Beautiful Soup defines classes for two main parsing strategies:
-
- * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
- language that kind of looks like XML.
-
- * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
- or invalid. This class has web browser-like heuristics for
- obtaining a sensible parse tree in the face of common HTML errors.
+Beautiful Soup works with Python 2.5 and up. To get it to work, you
+must install either lxml or html5lib.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
@@ -38,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/documentation.html
Here, have some legalese:
-Copyright (c) 2004-2009, Leonard Richardson
+Copyright (c) 2004-2011, Leonard Richardson
All rights reserved.
@@ -127,8 +107,8 @@ class BeautifulSoup(Tag):
from builder import LXMLTreeBuilder
return LXMLTreeBuilder()
- def __init__(self, markup="", builder=None, parseOnlyThese=None,
- fromEncoding=None):
+ def __init__(self, markup="", builder=None, parse_only=None,
+ from_encoding=None):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
@@ -138,14 +118,14 @@ class BeautifulSoup(Tag):
self.builder = builder
self.builder.soup = self
- self.parseOnlyThese = parseOnlyThese
+ self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
self.markup, self.original_encoding, self.declared_html_encoding = (
- self.builder.prepare_markup(markup, fromEncoding))
+ self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
@@ -201,9 +181,9 @@ class BeautifulSoup(Tag):
else:
currentData = ' '
self.currentData = []
- if self.parseOnlyThese and len(self.tagStack) <= 1 and \
- (not self.parseOnlyThese.text or \
- not self.parseOnlyThese.search(currentData)):
+ if self.parse_only and len(self.tagStack) <= 1 and \
+ (not self.parse_only.text or \
+ not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
@@ -251,9 +231,9 @@ class BeautifulSoup(Tag):
#print "Start tag %s: %s" % (name, attrs)
self.endData()
- if (self.parseOnlyThese and len(self.tagStack) <= 1
- and (self.parseOnlyThese.text
- or not self.parseOnlyThese.searchTag(name, attrs))):
+ if (self.parse_only and len(self.tagStack) <= 1
+ and (self.parse_only.text
+ or not self.parse_only.searchTag(name, attrs))):
return None
tag = Tag(self, self.builder, name, attrs, self.currentTag,
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 522960a..854cc56 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -208,7 +208,7 @@ class HTMLTreeBuilder(TreeBuilder):
match = self.CHARSET_RE.search(content)
if match:
if (self.soup.declared_html_encoding is not None or
- self.soup.original_encoding == self.soup.fromEncoding):
+ self.soup.original_encoding == self.soup.from_encoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index aa0bad2..5abc29d 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -18,7 +18,7 @@ class TestHTML5Builder(TestLXMLBuilder):
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
soup = self.soup(markup,
- parseOnlyThese=strainer)
+ parse_only=strainer)
self.assertEquals(
soup.decode(), self.document_for(markup))
@@ -210,7 +210,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
soup = self.soup(self.HEBREW_DOCUMENT,
- fromEncoding="iso-8859-8")
+ from_encoding="iso-8859-8")
self.assertEquals(soup.original_encoding, 'iso8859-8')
self.assertEquals(
soup.encode('utf-8'),
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 9d08aef..df2f341 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -325,7 +325,7 @@ class TestLXMLBuilder(SoupTest):
def test_soupstrainer(self):
strainer = SoupStrainer("b")
soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>",
- parseOnlyThese=strainer)
+ parse_only=strainer)
self.assertEquals(soup.decode(), "<b>bold</b>")
@@ -506,7 +506,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
soup = self.soup(self.HEBREW_DOCUMENT,
- fromEncoding="iso-8859-8")
+ from_encoding="iso-8859-8")
self.assertEquals(soup.original_encoding, 'iso-8859-8')
self.assertEquals(
soup.encode('utf-8'),
diff --git a/tests/test_soup.py b/tests/test_soup.py
index 01dff53..bb2262a 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -12,7 +12,7 @@ class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
strainer = SoupStrainer("b")
- soup = self.soup(markup, parseOnlyThese=strainer)
+ soup = self.soup(markup, parse_only=strainer)
self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>")
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 384d518..cefdf4a 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -863,7 +863,7 @@ class TestSubstitutions(SoupTest):
# meta tag got filtered out by the strainer. This test makes
# sure that doesn't happen.
strainer = SoupStrainer('pre')
- soup = self.soup(markup, parseOnlyThese=strainer)
+ soup = self.soup(markup, parse_only=strainer)
self.assertEquals(soup.contents[0].name, 'pre')