diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2009-04-08 18:01:26 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2009-04-08 18:01:26 -0400 |
commit | 6de21249eab260ecfa17499dc593204db5bd1f43 (patch) | |
tree | d7fef1e714dad702a309d55c8fe98e8f07cb586c /BeautifulSoup.py | |
parent | 5e932f046847c1417b7ff7dab580b56fd214db88 (diff) |
Removed rarely used parsers--if we need them again, we can make tree builders.
Diffstat (limited to 'BeautifulSoup.py')
-rw-r--r-- | BeautifulSoup.py | 99 |
1 files changed, 0 insertions, 99 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py index 918077e..2f8c3ef 100644 --- a/BeautifulSoup.py +++ b/BeautifulSoup.py @@ -1638,105 +1638,6 @@ class BeautifulSoup(BeautifulStoneSoup): class StopParsing(Exception): pass -class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: - - <b>Foo<b>Bar</b></b> - - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "<b>Foo<b>Bar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '</b></b>' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" - - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big'] - - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] - - NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) - -class MinimalSoup(BeautifulSoup): - """The MinimalSoup class is for parsing HTML that contains - pathologically bad markup. It makes no assumptions about tag - nesting, but it does know which tags are self-closing, that - <script> tags contain Javascript and should not be parsed, that - META tags may contain encoding information, and so on. - - This also makes it better for subclassing than BeautifulStoneSoup - or BeautifulSoup.""" - - RESET_NESTING_TAGS = buildTagMap('noscript') - NESTABLE_TAGS = {} - -class BeautifulSOAP(BeautifulStoneSoup): - """This class will push a tag with only a single string child into - the tag's parent as an attribute. The attribute's name is the tag - name, and the value is the string child. An example should give - the flavor of the change: - - <foo><bar>baz</bar></foo> - => - <foo bar="baz"><bar>baz</bar></foo> - - You can then access fooTag['bar'] instead of fooTag.barTag.string. - - This is, of course, useful for scraping structures that tend to - use subelements instead of attributes, such as SOAP messages. Note - that it modifies its input, so don't print the modified version - out. - - I'm not sure how many people really want to use this class; let me - know if you do. Mainly I like the name.""" - - def popTag(self): - if len(self.tagStack) > 1: - tag = self.tagStack[-1] - parent = self.tagStack[-2] - parent._getAttrMap() - if (isinstance(tag, Tag) and len(tag.contents) == 1 and - isinstance(tag.contents[0], NavigableString) and - not parent.attrMap.has_key(tag.name)): - parent[tag.name] = tag.contents[0] - BeautifulStoneSoup.popTag(self) - -#Enterprise class names! It has come to our attention that some people -#think the names of the Beautiful Soup parser classes are too silly -#and "unprofessional" for use in enterprise screen-scraping. We feel -#your pain! For such-minded folk, the Beautiful Soup Consortium And -#All-Night Kosher Bakery recommends renaming this file to -#"RobustParser.py" (or, in cases of extreme enterprisiness, -#"RobustParserBeanInterface.class") and using the following -#enterprise-friendly class aliases: -class RobustXMLParser(BeautifulStoneSoup): - pass -class RobustHTMLParser(BeautifulSoup): - pass -class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): - pass -class RobustInsanelyWackAssHTMLParser(MinimalSoup): - pass -class SimplifyingSOAPParser(BeautifulSOAP): - pass - ###################################################### # # Bonus library: Unicode, Dammit |