summaryrefslogtreecommitdiff
path: root/BeautifulSoup.py
diff options
context:
space:
mode:
authorLeonard Richardson <leonard.richardson@canonical.com>2009-04-08 18:01:26 -0400
committerLeonard Richardson <leonard.richardson@canonical.com>2009-04-08 18:01:26 -0400
commit6de21249eab260ecfa17499dc593204db5bd1f43 (patch)
treed7fef1e714dad702a309d55c8fe98e8f07cb586c /BeautifulSoup.py
parent5e932f046847c1417b7ff7dab580b56fd214db88 (diff)
Removed rarely used parsers--if we need them again, we can make tree builders.
Diffstat (limited to 'BeautifulSoup.py')
-rw-r--r--BeautifulSoup.py99
1 files changed, 0 insertions, 99 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index 918077e..2f8c3ef 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -1638,105 +1638,6 @@ class BeautifulSoup(BeautifulStoneSoup):
class StopParsing(Exception):
pass
-class ICantBelieveItsBeautifulSoup(BeautifulSoup):
-
- """The BeautifulSoup class is oriented towards skipping over
- common HTML errors like unclosed tags. However, sometimes it makes
- errors of its own. For instance, consider this fragment:
-
- <b>Foo<b>Bar</b></b>
-
- This is perfectly valid (if bizarre) HTML. However, the
- BeautifulSoup class will implicitly close the first b tag when it
- encounters the second 'b'. It will think the author wrote
- "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
- there's no real-world reason to bold something that's already
- bold. When it encounters '</b></b>' it will close two more 'b'
- tags, for a grand total of three tags closed instead of two. This
- can throw off the rest of your document structure. The same is
- true of a number of other tags, listed below.
-
- It's much more common for someone to forget to close a 'b' tag
- than to actually use nested 'b' tags, and the BeautifulSoup class
- handles the common case. This class handles the not-co-common
- case: where you can't believe someone wrote what they did, but
- it's valid HTML and BeautifulSoup screwed up by assuming it
- wouldn't be."""
-
- I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
- ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
- 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
- 'big']
-
- I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
-
- NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
- I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
- I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
-
-class MinimalSoup(BeautifulSoup):
- """The MinimalSoup class is for parsing HTML that contains
- pathologically bad markup. It makes no assumptions about tag
- nesting, but it does know which tags are self-closing, that
- <script> tags contain Javascript and should not be parsed, that
- META tags may contain encoding information, and so on.
-
- This also makes it better for subclassing than BeautifulStoneSoup
- or BeautifulSoup."""
-
- RESET_NESTING_TAGS = buildTagMap('noscript')
- NESTABLE_TAGS = {}
-
-class BeautifulSOAP(BeautifulStoneSoup):
- """This class will push a tag with only a single string child into
- the tag's parent as an attribute. The attribute's name is the tag
- name, and the value is the string child. An example should give
- the flavor of the change:
-
- <foo><bar>baz</bar></foo>
- =>
- <foo bar="baz"><bar>baz</bar></foo>
-
- You can then access fooTag['bar'] instead of fooTag.barTag.string.
-
- This is, of course, useful for scraping structures that tend to
- use subelements instead of attributes, such as SOAP messages. Note
- that it modifies its input, so don't print the modified version
- out.
-
- I'm not sure how many people really want to use this class; let me
- know if you do. Mainly I like the name."""
-
- def popTag(self):
- if len(self.tagStack) > 1:
- tag = self.tagStack[-1]
- parent = self.tagStack[-2]
- parent._getAttrMap()
- if (isinstance(tag, Tag) and len(tag.contents) == 1 and
- isinstance(tag.contents[0], NavigableString) and
- not parent.attrMap.has_key(tag.name)):
- parent[tag.name] = tag.contents[0]
- BeautifulStoneSoup.popTag(self)
-
-#Enterprise class names! It has come to our attention that some people
-#think the names of the Beautiful Soup parser classes are too silly
-#and "unprofessional" for use in enterprise screen-scraping. We feel
-#your pain! For such-minded folk, the Beautiful Soup Consortium And
-#All-Night Kosher Bakery recommends renaming this file to
-#"RobustParser.py" (or, in cases of extreme enterprisiness,
-#"RobustParserBeanInterface.class") and using the following
-#enterprise-friendly class aliases:
-class RobustXMLParser(BeautifulStoneSoup):
- pass
-class RobustHTMLParser(BeautifulSoup):
- pass
-class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
- pass
-class RobustInsanelyWackAssHTMLParser(MinimalSoup):
- pass
-class SimplifyingSOAPParser(BeautifulSOAP):
- pass
-
######################################################
#
# Bonus library: Unicode, Dammit