Diffstat (limited to 'src')
-rw-r--r--   src/beautifulsoup/AUTHORS                    |   34
-rw-r--r--   src/beautifulsoup/CHANGELOG                  |  122
-rw-r--r--   src/beautifulsoup/README                     |    7
-rw-r--r--   src/beautifulsoup/TODO                       |   42
-rw-r--r--   src/beautifulsoup/__init__.py                |  368
-rw-r--r--   src/beautifulsoup/builder.py                 |  484
-rw-r--r--   src/beautifulsoup/builder.py.3.diff          |    4
-rw-r--r--   src/beautifulsoup/dammit.py                  |  292
-rw-r--r--   src/beautifulsoup/dammit.py.3.diff           |   70
-rw-r--r--   src/beautifulsoup/docs/__init__.py           |   16
-rw-r--r--   src/beautifulsoup/element.py                 |  870
-rw-r--r--   src/beautifulsoup/element.py.3.diff          |    8
-rw-r--r--   src/beautifulsoup/tests/__init__.py          |   16
-rw-r--r--   src/beautifulsoup/tests/test_docs.py         |   51
-rw-r--r--   src/beautifulsoup/tests/test_docs.py.3.diff  |  122
-rw-r--r--   src/beautifulsoup/tests/test_soup.py         |  854
-rw-r--r--   src/beautifulsoup/util.py                    |   29
17 files changed, 3389 insertions, 0 deletions
diff --git a/src/beautifulsoup/AUTHORS b/src/beautifulsoup/AUTHORS
new file mode 100644
index 0000000..d353253
--- /dev/null
+++ b/src/beautifulsoup/AUTHORS
@@ -0,0 +1,34 @@
+Behold, mortal, the origins of Beautiful Soup...
+================================================
+
+Leonard Richardson is the primary programmer.
+
+Sam Ruby helps with a lot of edge cases.
+
+Mark Pilgrim provided the encoding detection code that forms the base
+of UnicodeDammit.
+
+Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
+work in solving the nestable tags conundrum.
+
+The following people have contributed patches to Beautiful Soup:
+
+ Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
+ Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
+ Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
+ Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
+ Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
+ Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
+ Webster, Paul Wright, Danny Yoo
+
+The following people made suggestions or found bugs or found ways to
+break Beautiful Soup:
+
+ Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Matt Ernst,
+ Michael Foord, Tom Harris, Bill de hOra, Donald Howes, Matt
+ Patterson, Scott Roberts, Steve Strassmann, Mike Williams, warchild
+ at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, Joren Mc,
+ Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed Summers,
+ Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart Turner, Greg
+ Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de Sousa Rocha,
+ Yichun Wei, Per Vognsen
diff --git a/src/beautifulsoup/CHANGELOG b/src/beautifulsoup/CHANGELOG
new file mode 100644
index 0000000..4e97e1b
--- /dev/null
+++ b/src/beautifulsoup/CHANGELOG
@@ -0,0 +1,122 @@
+= 3.1.0 =
+
+A hybrid version that supports 2.4 and can be automatically converted
+to run under Python 3.0. There are three backwards-incompatible
+changes you should be aware of, but no new features or deliberate
+behavior changes.
+
+1. str() may no longer do what you want. This is because the meaning
+of str() inverts between Python 2 and 3; in Python 2 it gives you a
+byte string, in Python 3 it gives you a Unicode string.
+
+The effect of this is that you can't pass an encoding to .__str__
+anymore. Use encode() to get a string and decode() to get Unicode, and
+you'll be ready (well, readier) for Python 3.
+
+2. Beautiful Soup is now based on HTMLParser rather than SGMLParser,
+which is gone in Python 3. There's some bad HTML that SGMLParser
+handled but HTMLParser doesn't, usually to do with attribute values
+that aren't closed or have brackets inside them:
+
+ <a href="foo</a>, </a><a href="bar">baz</a>
+ <a b="<a>">', '<a b="<a>"></a><a>"></a>
+
+A later version of Beautiful Soup will allow you to plug in different
+parsers to make tradeoffs between speed and the ability to handle bad
+HTML.
+
+3. In Python 3 (but not Python 2), HTMLParser converts entities within
+attributes to the corresponding Unicode characters. In Python 2 it's
+possible to parse this string and leave the &eacute; intact.
+
+ <a href="http://crummy.com?sacr&eacute;&bleu">
+
+In Python 3, the &eacute; is always converted to \xe9 during
+parsing.
+
+
+= 3.0.7a =
+
+Added an import that makes BS work in Python 2.3.
+
+
+= 3.0.7 =
+
+Fixed a UnicodeDecodeError when unpickling documents that contain
+non-ASCII characters.
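+
+(To illustrate the str() change described under 3.1.0 above, here is a
+sketch of the portable calling convention; the markup and encoding
+name are illustrative only:
+
+    soup = BeautifulSoup("<p>caf&eacute;</p>")
+    as_bytes = soup.encode("utf-8")   # byte string under 2.x and 3.0
+    as_unicode = soup.decode()        # Unicode string under both
+
+rather than the old soup.__str__("utf-8").)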
+
+Fixed a TypeError that occurred in some circumstances when a tag
+contained no text.
+
+Jump through hoops to avoid the use of chardet, which can be extremely
+slow in some circumstances. UTF-8 documents should never trigger the
+use of chardet.
+
+Whitespace is preserved inside <pre> and <textarea> tags that contain
+nothing but whitespace.
+
+Beautiful Soup can now parse a doctype that's scoped to an XML namespace.
+
+
+= 3.0.6 =
+
+Got rid of a very old debug line that prevented chardet from working.
+
+Added a Tag.decompose() method that completely disconnects a tree or a
+subset of a tree, breaking it up into bite-sized pieces that are
+easy for the garbage collector to collect.
+
+Tag.extract() now returns the tag that was extracted.
+
+Tag.findNext() now does something with the keyword arguments you pass
+it instead of dropping them on the floor.
+
+Fixed a Unicode conversion bug.
+
+Fixed a bug that garbled some <meta> tags when rewriting them.
+
+
+= 3.0.5 =
+
+Soup objects can now be pickled, and copied with copy.deepcopy.
+
+Tag.append now works properly on existing BS objects. (It wasn't
+originally intended for outside use, but it can be now.) (Giles
+Radford)
+
+Passing in a nonexistent encoding will no longer crash the parser on
+Python 2.4 (John Nagle).
+
+Fixed an underlying bug in SGMLParser that thinks ASCII has 255
+characters instead of 127 (John Nagle).
+
+Entities are converted more consistently to Unicode characters.
+
+Entity references in attribute values are now converted to Unicode
+characters when appropriate. Numeric entities are always converted,
+because SGMLParser always converts them outside of attribute values.
+
+ALL_ENTITIES happens to just be the XHTML entities, so I renamed it to
+XHTML_ENTITIES.
+
+The regular expression for bare ampersands was too loose. In some
+cases ampersands were not being escaped. (Sam Ruby?)
+
+Non-breaking spaces and other special Unicode space characters are no
+longer folded to ASCII spaces. (Robert Leftwich)
+
+Information inside a TEXTAREA tag is now parsed literally, not as HTML
+tags. TEXTAREA now works exactly the same way as SCRIPT. (Zephyr Fang)
+
+
+= 3.0.4 =
+
+Fixed a bug that crashed Unicode conversion in some cases.
+
+Fixed a bug that prevented UnicodeDammit from being used as a
+general-purpose data scrubber.
+
+Fixed some unit test failures when running against Python 2.5.
+
+When considering whether to convert smart quotes, UnicodeDammit now
+looks at the original encoding in a case-insensitive way.
diff --git a/src/beautifulsoup/README b/src/beautifulsoup/README
new file mode 100644
index 0000000..8feac71
--- /dev/null
+++ b/src/beautifulsoup/README
@@ -0,0 +1,7 @@
+The canonical version of Beautiful Soup is the Python 2 version. You
+can generate the Python 3 version by running to3.sh, or by doing what
+to3.sh does: run 2to3 on BeautifulSoup.py and BeautifulSoupTests.py,
+then apply the appropriate .3.diff file to each generated script.
+
+The testall.sh script tests both the Python 2 version and a freshly
+generated Python 3 version.
\ No newline at end of file diff --git a/src/beautifulsoup/TODO b/src/beautifulsoup/TODO new file mode 100644 index 0000000..84fa273 --- /dev/null +++ b/src/beautifulsoup/TODO @@ -0,0 +1,42 @@ +Here are some unit tests that fail with HTMLParser. + + def testValidButBogusDeclarationFAILS(self): + self.assertSoupEquals('<! Foo >a', '<!Foo >a') + + def testIncompleteDeclarationAtEndFAILS(self): + self.assertSoupEquals('a<!b') + + def testIncompleteEntityAtEndFAILS(self): + self.assertSoupEquals('<Hello>') + + # This is not what the original author had in mind, but it's + # a legitimate interpretation of what they wrote. + self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""", + '<a href="foo</a>, </a><a href="></a>, <a href="bar">baz</a>') + # SGMLParser generates bogus parse events when attribute values + # contain embedded brackets, but at least Beautiful Soup fixes + # it up a little. + self.assertSoupEquals('<a b="<a>">', '<a b="<a>"></a><a>"></a>') + self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah', + """<a href='"http://foo.com/'></a><a> and blah and blah</a>""") + + invalidEntity = "foo&#bar;baz" + soup = BeautifulStoneSoup\ + (invalidEntity, + convertEntities=htmlEnt) + self.assertEquals(str(soup), invalidEntity) + + +Tag names that contain Unicode characters crash the parser: + def testUnicodeTagNamesFAILS(self): + self.assertSoupEquals("<f_äf_text>2PM</f_äf_text>") + +Here's the implementation of NavigableString.__unicode__: + + def __unicode__(self): + return unicode(str(self)) + +It converts the Unicode to a string, and then back to Unicode. I can't +find any other way of turning an element of a Unicode subclass into a +normal Unicode object. This is pretty bad and a better technique is +welcome. diff --git a/src/beautifulsoup/__init__.py b/src/beautifulsoup/__init__.py new file mode 100644 index 0000000..0999c07 --- /dev/null +++ b/src/beautifulsoup/__init__.py @@ -0,0 +1,368 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses a (possibly invalid) XML or HTML document into a +tree representation. It provides methods and Pythonic idioms that make +it easy to navigate, search, and modify the tree. + +A well-formed XML/HTML document yields a well-formed data +structure. An ill-formed XML/HTML document yields a correspondingly +ill-formed data structure. If your document is only locally +well-formed, you can use this library to find and process the +well-formed part of it. + +Beautiful Soup works with Python 2.2 and up. It has no external +dependencies, but you'll have more success at converting data to UTF-8 +if you also install these three packages: + +* chardet, for auto-detecting character encodings + http://chardet.feedparser.org/ +* cjkcodecs and iconv_codec, which add more encodings to the ones supported + by stock Python. + http://cjkpython.i18n.org/ + +Beautiful Soup defines classes for two main parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. This class has web browser-like heuristics for + obtaining a sensible parse tree in the face of common HTML errors. 
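+
+For example, the two classes resolve the same mis-nested markup
+differently (an illustrative sketch):
+
+  from beautifulsoup import BeautifulSoup, BeautifulStoneSoup
+
+  BeautifulSoup("<p>One<p>Two")       # -> <p>One</p><p>Two</p>
+  BeautifulStoneSoup("<p>One<p>Two")  # -> <p>One<p>Two</p></p>
+
+The first class knows that an HTML <p> can't nest inside another <p>;
+the second makes no such assumption.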
+ +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/documentation.html + +Here, have some legalese: + +Copyright (c) 2004-2009, Leonard Richardson + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the the Beautiful Soup Consortium and All + Night Kosher Bakery nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. + +""" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.0.0" +__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" +__license__ = "New-style BSD" + +__all__ = ['BeautifulSoup', + + # Stuff imported from other packages + 'Entities', + 'HTMLParserXMLTreeBuilder', + 'HTMLParserTreeBuilder', + 'HTMLTreeBuilder', + 'XMLTreeBuilder', + + 'BeautifulStoneSoup', + 'ICantBelieveItsBeautifulSoup'] + +import re + +from util import isList, isString, buildSet +from dammit import UnicodeDammit +from builder import ( + HTMLParserXMLTreeBuilder, HTMLParserTreeBuilder, HTMLTreeBuilder, + ICantBelieveItsValidHTMLTreeBuilder, XMLTreeBuilder) +from element import Entities, NavigableString, Tag + + +class BeautifulStoneSoup(Tag): + """ + This class defines the basic interface called by the tree builders. + + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter a self-closing tag, call handle_starttag and then + handle_endtag, but note that the tag will not be displayed as a + self-closing tag unless you also have your builder's + isSelfClosingTag() implementation return True when passed the tag + name. 
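+
+    For instance, a builder's feed() implementation could turn the
+    markup <a href="/">text</a> into this event sequence (a sketch,
+    not a real builder):
+
+        self.soup.handle_starttag('a', [('href', '/')])
+        self.soup.handle_data('text')
+        self.soup.handle_endtag('a')
+        self.soup.endData()  # flush any text still buffered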
+ """ + ROOT_TAG_NAME = u'[document]' + + # Used to detect the charset in a META tag; see handleSpecialMetaTag + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } + + def _defaultBuilder(self): + return HTMLParserXMLTreeBuilder() + + def __init__(self, markup="", builder=None, parseOnlyThese=None, + fromEncoding=None): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" + + if builder is None: + builder = self._defaultBuilder() + self.builder = builder + self.builder.soup = self + + self.parseOnlyThese = parseOnlyThese + self.fromEncoding = fromEncoding + + self.reset() + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + self.markup = markup + try: + self._feed(isHTML=self.builder.assume_html) + except StopParsing: + pass + self.markup = None # The markup can now be GCed. + self.builder.soup = None + self.builder = None # So can the builder. + + def _feed(self, inDocumentEncoding=None, isHTML=False): + # Convert the document to Unicode. + markup = self.markup + if isinstance(markup, unicode): + if not hasattr(self, 'originalEncoding'): + self.originalEncoding = None + else: + dammit = UnicodeDammit\ + (markup, [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.builder.smartQuotesTo, isHTML=isHTML) + markup = dammit.unicode + self.originalEncoding = dammit.originalEncoding + self.declaredHTMLEncoding = dammit.declaredHTMLEncoding + self.builder.reset() + + self.builder.feed(markup) + # Close out any unfinished strings and close all the open tags. 
+ self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableString): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not buildSet([tag.name for tag in self.tagStack]).intersection( + self.builder.preserve_whitespace_tags)): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parseOnlyThese and len(self.tagStack) <= 1 and \ + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): + return + o = containerClass(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def handle_starttag(self, name, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occured + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. 
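+
+        A builder that honors this contract checks the return value
+        (sketch):
+
+            tag = self.soup.handle_starttag(name, attrs)
+            if tag is None:
+                return  # rejected by the SoupStrainer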
+ """ + + #print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parseOnlyThese and len(self.tagStack) <= 1 + and (self.parseOnlyThese.text + or not self.parseOnlyThese.searchTag(name, attrs))): + return None + + containsSubstitutions = False + if name == 'meta' and self.builder.assume_html: + containsSubstitutions = self.handleSpecialMetaTag(attrs) + + tag = Tag(self, self.builder, name, attrs, self.currentTag, + self.previous) + tag.containsSubstitutions = containsSubstitutions + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name): + #print "End tag: " + name + self.endData() + self._popToTag(name) + + def handle_data(self, data): + self.currentData.append(data) + + def handleSpecialMetaTag(self, attrs): + """Beautiful Soup can detect a charset included in a META tag, + try to convert the document to that charset, and re-parse the + document from the beginning.""" + httpEquiv = None + contentType = None + contentTypeIndex = None + tagNeedsEncodingSubstitution = False + + for i in range(0, len(attrs)): + key, value = attrs[i] + key = key.lower() + if key == 'http-equiv': + httpEquiv = value + elif key == 'content': + contentType = value + contentTypeIndex = i + + if httpEquiv and contentType: # It's an interesting meta tag. + match = self.CHARSET_RE.search(contentType) + if match: + if (self.declaredHTMLEncoding is not None or + self.originalEncoding == self.fromEncoding): + # An HTML encoding was sniffed while converting + # the document to Unicode, or an HTML encoding was + # sniffed during a previous pass through the + # document, or an encoding was specified + # explicitly and it worked. Rewrite the meta tag. + def rewrite(match): + return match.group(1) + "%SOUP-ENCODING%" + newAttr = self.CHARSET_RE.sub(rewrite, contentType) + attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], + newAttr) + tagNeedsEncodingSubstitution = True + else: + # This is our first pass through the document. + # Go through it again with the encoding information. + newCharset = match.group(3) + if newCharset and newCharset != self.originalEncoding: + self.declaredHTMLEncoding = newCharset + self._feed(self.declaredHTMLEncoding) + raise StopParsing + pass + return tagNeedsEncodingSubstitution + + +class BeautifulSoup(BeautifulStoneSoup): + """A convenience class for parsing HTML without creating a builder.""" + def _defaultBuilder(self): + return HTMLParserTreeBuilder() + + +class StopParsing(Exception): + pass + + +class ICantBelieveItsBeautifulSoup(BeautifulStoneSoup): + def _defaultBuilder(self): + return ICantBelieveItsValidHTMLBuilder() + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() diff --git a/src/beautifulsoup/builder.py b/src/beautifulsoup/builder.py new file mode 100644 index 0000000..aeae1e8 --- /dev/null +++ b/src/beautifulsoup/builder.py @@ -0,0 +1,484 @@ +import markupbase +import re +from util import buildSet, isList, isString +# element has taken care of import weirdness, so import name2codepoint +# from there to avoid duplicating the weirdness. 
+from element import name2codepoint +from element import ( + CData, Comment, Declaration, Entities, ProcessingInstruction) +from HTMLParser import HTMLParser, HTMLParseError + +__all__ = ['TreeBuilder', + 'HTMLParserXMLTreeBuilder', + 'HTMLParserTreeBuilder', + 'XMLTreeBuilder', + 'HTMLTreeBuilder', + 'ICantBelieveItsValidHTMLTreeBuilder'] + +#This hack makes the HTMLParser-based tree builders able to parse XML +#with namespaces. +markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. + Used to build the nestable_tags and reset_nesting_tags maps out of + lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion) and not isString(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + + +class TreeBuilder(Entities): + + smartQuotesTo = Entities.XML_ENTITIES + preserve_whitespace_tags = buildSet() + quote_tags = buildSet() + self_closing_tags = buildSet() + assume_html = False + + def __init__(self): + self.soup = None + + def isSelfClosingTag(self, name): + return name in self.self_closing_tags + + def reset(self): + pass + + def feed(self): + pass + + +class HTMLParserXMLTreeBuilder(HTMLParser, TreeBuilder): + + """ + This class defines a basic tree builder based on Python's built-in + HTMLParser. The tree builder knows nothing about tag + behavior except for the following: + + You can't close a tag without closing all the tags it encloses. + That is, "<foo><bar></foo>" actually means + "<foo><bar></bar></foo>". + + [Another possible explanation is "<foo><bar /></foo>", but unless + you specify 'bar' in self_closing_tags, this class will never use + that explanation.] + + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting. + + + HTMLParser will process most bad HTML, and the BeautifulSoup class + has some tricks for dealing with some HTML that kills HTMLParser, + but Beautiful Soup can nonetheless choke or lose data if your data + uses self-closing tags or declarations incorrectly. + + This class uses regexes to sanitize input, avoiding the vast + majority of these problems. If the problems don't apply to you, + pass in False for markupMassage, and you'll get better + performance. + + The default parser massage techniques fix the two most common + instances of invalid HTML that choke HTMLParser: + + <br/> (No space between name of closing tag and tag close) + <! --Comment--> (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get HTMLParserXMLTreeBuilder to scrub your input the way you + want. + """ + reset_nesting_tags = {} + nestable_tags = {} + + MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda x: x.group(1) + ' />'), + (re.compile('<!\s+([^<>]*)>'), + lambda x: '<!' 
+ x.group(1) + '>') + ] + + def __init__(self, convertEntities=None, markupMassage=True, + selfClosingTags=None, + smartQuotesTo=Entities.XML_ENTITIES): + HTMLParser.__init__(self) + self.soup = None + self.convertEntities = convertEntities + self.instanceSelfClosingTags = buildSet(selfClosingTags or []) + self.markupMassage = markupMassage + self.smartQuotesTo = smartQuotesTo + self.quoteStack = [] + + # Set the rules for how we'll deal with the entities we + # encounter + if self.convertEntities: + # It doesn't make sense to convert encoded characters to + # entities even while you're converting entities to Unicode. + # Just convert it all to Unicode. + self.smartQuotesTo = None + if convertEntities == self.HTML_ENTITIES: + self.convertXMLEntities = False + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = True + elif convertEntities == self.XHTML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = False + elif convertEntities == self.XML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + else: + self.convertXMLEntities = False + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + + def feed(self, markup): + if markup is not None: + if self.markupMassage: + if not isList(self.markupMassage): + self.markupMassage = self.MARKUP_MASSAGE + for fix, m in self.markupMassage: + markup = fix.sub(m, markup) + # TODO: We get rid of markupMassage so that the + # soup object can be deepcopied later on. Some + # Python installations can't copy regexes. If anyone + # was relying on the existence of markupMassage, this + # might cause problems. + # XXX: This might not be necessary now that we've moved + # the massage code into the builder. + #del(self.markupMassage) + HTMLParser.feed(self, markup) + + def isSelfClosingTag(self, name): + """Returns true iff the given string is the name of a + self-closing tag according to this parser.""" + return (name in self.self_closing_tags + or name in self.instanceSelfClosingTags) + + def handle_starttag(self, name, attrs): + if len(self.quoteStack) > 0: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + if not self.isSelfClosingTag(name): + self.soup.endData() + self._smartPop(name) + tag = self.soup.handle_starttag(name, attrs) + if tag is None: + # The tag was filtered out by the SoupStrainer + return + if name in self.quote_tags: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + if self.isSelfClosingTag(name): + self.soup.handle_endtag(name) + + def handle_endtag(self, name): + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. + #print "</%s> is not real!" 
% name + self.handle_data('</%s>' % name) + return + self.soup.handle_endtag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, content): + #print "Handling data " + content + self.soup.handle_data(content) + + def handle_pi(self, text): + """Handle a processing instruction as a ProcessingInstruction + object, possibly one with a %SOUP-ENCODING% slot into which an + encoding will be plugged later.""" + if text[:3] == "xml": + text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + self._toStringSubclass(text, ProcessingInstruction) + + def handle_comment(self, text): + "Handle comments as Comment objects." + self._toStringSubclass(text, Comment) + + def handle_charref(self, ref): + "Handle character references as data." + if self.convertEntities: + data = unichr(int(ref)) + else: + data = '&#%s;' % ref + self.handle_data(data) + + def handle_entityref(self, ref): + """Handle entity references as data, possibly converting known + HTML and/or XML entity references to the corresponding Unicode + characters.""" + data = None + if self.convertHTMLEntities: + try: + data = unichr(name2codepoint[ref]) + except KeyError: + pass + + if not data and self.convertXMLEntities: + data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + + if not data and self.convertHTMLEntities and \ + not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # TODO: We've got a problem here. We're told this is + # an entity reference, but it's not an XML entity + # reference or an HTML entity reference. Nonetheless, + # the logical thing to do is to pass it through as an + # unrecognized entity reference. + # + # Except: when the input is "&carol;" this function + # will be called with input "carol". When the input is + # "AT&T", this function will be called with input + # "T". We have no way of knowing whether a semicolon + # was present originally, so we don't know whether + # this is an unknown entity or just a misplaced + # ampersand. + # + # The more common case is a misplaced ampersand, so I + # escape the ampersand and omit the trailing semicolon. + data = "&%s" % ref + if not data: + # This case is different from the one above, because we + # haven't already gone through a supposedly comprehensive + # mapping of entities to Unicode characters. We might not + # have gone through any mapping at all. So the chances are + # very high that this is a real entity, and not a + # misplaced ampersand. + data = "&%s;" % ref + self.handle_data(data) + + def handle_decl(self, data): + "Handle DOCTYPEs and the like as Declaration objects." + self._toStringSubclass(data, Declaration) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.soup.endData() + self.handle_data(text) + self.soup.endData(subclass) + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: + <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. + <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. + <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. + + <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. 
+ <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' + <td><tr><td> *<td>* should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.nestable_tags.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.reset_nesting_tags.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.soup.tagStack)-1, 0, -1): + p = self.soup.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. + popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.reset_nesting_tags.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self.soup._popToTag(popTo, inclusive) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as a CData object.""" + j = None + if self.rawdata[i:i+9] == '<![CDATA[': + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) + else: + try: + j = HTMLParser.parse_declaration(self, i) + except HTMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + + +class HTMLParserTreeBuilder(HTMLParserXMLTreeBuilder): + """This builder knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a <p> tag should implicitly close the previous <p> tag. + + <p>Para1<p>Para2 + should be transformed into: + <p>Para1</p><p>Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a <blockquote> tag should _not_ implicitly close the previous + <blockquote> tag. + + Alice said: <blockquote>Bob said: <blockquote>Blah + should NOT be transformed into: + Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a <tr> tag should + implicitly close the previous <tr> tag within the same <table>, + but not close a <tr> tag in another table. + + <table><tr>Blah<tr>Blah + should be transformed into: + <table><tr>Blah</tr><tr>Blah + but, + <tr>Blah<table><tr>Blah + should NOT be transformed into + <tr>Blah<table></tr><tr>Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. 
If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try subclassing this tree builder or using another parser's tree + builder.""" + + assume_html = True + preserve_whitespace_tags = buildSet(['pre', 'textarea']) + quote_tags = buildSet(['script', 'textarea']) + self_closing_tags = buildSet(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + nestable_list_tags = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + nestable_table_tags = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + 'thead' : ['table'], + 'tbody' : ['table'], + 'tfoot' : ['table'], + } + + non_nestable_block_tags = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. + reset_nesting_tags = buildTagMap(None, nestable_block_tags, 'noscript', + non_nestable_block_tags, + nestable_list_tags, + nestable_table_tags) + + nestable_tags = buildTagMap([], nestable_inline_tags, nestable_block_tags, + nestable_list_tags, nestable_table_tags) + + + def __init__(self, *args, **kwargs): + if not kwargs.has_key('smartQuotesTo'): + kwargs['smartQuotesTo'] = self.HTML_ENTITIES + HTMLParserXMLTreeBuilder.__init__(self, *args, **kwargs) + +# Some aliases to use if you don't care about the underlying +# implementation. +XMLTreeBuilder = HTMLParserXMLTreeBuilder +HTMLTreeBuilder = HTMLParserTreeBuilder + +class ICantBelieveItsValidHTMLTreeBuilder(HTMLParserTreeBuilder): + """The is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + <b>Foo<b>Bar</b></b> + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "<b>Foo<b>Bar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '</b></b>' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close a 'b' tag + than to actually use nested 'b' tags, and the BeautifulSoup class + handles the common case. 
This class handles the not-co-common + case: where you can't believe someone wrote what they did, but + it's valid HTML and BeautifulSoup screwed up by assuming it + wouldn't be.""" + i_cant_believe_theyre_nestable_inline_tags = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + i_cant_believe_theyre_nestable_block_tags = ['noscript'] + + nestable_tags = buildTagMap([], HTMLParserTreeBuilder.nestable_tags, + i_cant_believe_theyre_nestable_block_tags, + i_cant_believe_theyre_nestable_inline_tags) diff --git a/src/beautifulsoup/builder.py.3.diff b/src/beautifulsoup/builder.py.3.diff new file mode 100644 index 0000000..91b510d --- /dev/null +++ b/src/beautifulsoup/builder.py.3.diff @@ -0,0 +1,4 @@ +90c90 +< from HTMLParser import HTMLParser, HTMLParseError +--- +> from html.parser import HTMLParser, HTMLParseError diff --git a/src/beautifulsoup/dammit.py b/src/beautifulsoup/dammit.py new file mode 100644 index 0000000..78bd4b2 --- /dev/null +++ b/src/beautifulsoup/dammit.py @@ -0,0 +1,292 @@ +"""Beautiful Soup bonus library: Unicode, Dammit + +This class forces XML data into a standard format (usually to UTF-8 or +Unicode). It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It does not rewrite the XML or HTML to reflect a new +encoding; that's Beautiful Soup's job. +""" + +import codecs +import re +import types + +# Autodetects character encodings. +# Download from http://chardet.feedparser.org/ +try: + import chardet +# import chardet.constants +# chardet.constants._debug = 1 +except ImportError: + chardet = None + +# cjkcodecs and iconv_codec make Python know about more character encodings. +# Both are available from http://cjkpython.i18n.org/ +# They're built in if you use Python 2.4. +try: + import cjkcodecs.aliases +except ImportError: + pass +try: + import iconv_codec +except ImportError: + pass + + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. 
+ CHARSET_ALIASES = { "macintosh" : "mac-roman", + "x-sjis" : "shift-jis" } + + def __init__(self, markup, overrideEncodings=[], + smartQuotesTo='xml', isHTML=False): + self.declaredHTMLEncoding = None + self.markup, documentEncoding, sniffedEncoding = \ + self._detectEncoding(markup, isHTML) + self.smartQuotesTo = smartQuotesTo + self.triedEncodings = [] + if markup == '' or isinstance(markup, unicode): + self.originalEncoding = None + self.unicode = unicode(markup) + return + + u = None + for proposedEncoding in overrideEncodings: + u = self._convertFrom(proposedEncoding) + if u: break + if not u: + for proposedEncoding in (documentEncoding, sniffedEncoding): + u = self._convertFrom(proposedEncoding) + if u: break + + # If no luck and we have auto-detection library, try that: + if not u and chardet and not isinstance(self.markup, unicode): + u = self._convertFrom(chardet.detect(self.markup)['encoding']) + + # As a last resort, try utf-8 and windows-1252: + if not u: + for proposed_encoding in ("utf-8", "windows-1252"): + u = self._convertFrom(proposed_encoding) + if u: break + + self.unicode = u + if not u: self.originalEncoding = None + + def _subMSChar(self, match): + """Changes a MS smart quote character to an XML or HTML + entity.""" + orig = match.group(1) + sub = self.MS_CHARS.get(orig) + if type(sub) == types.TupleType: + if self.smartQuotesTo == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convertFrom(self, proposed): + proposed = self.find_codec(proposed) + if not proposed or proposed in self.triedEncodings: + return None + self.triedEncodings.append(proposed) + markup = self.markup + + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if self.smartQuotesTo and proposed.lower() in("windows-1252", + "iso-8859-1", + "iso-8859-2"): + smart_quotes_re = "([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._subMSChar, markup) + + try: + # print "Trying to convert document to %s" % proposed + u = self._toUnicode(markup, proposed) + self.markup = u + self.originalEncoding = proposed + except Exception, e: + # print "That didn't work!" + # print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _toUnicode(self, data, encoding): + '''Given a string and its encoding, decodes the string into Unicode. 
+ %encoding is a string recognized by encodings.aliases''' + + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding) + return newdata + + def _detectEncoding(self, xml_data, isHTML=False): + """Given a document, tries to detect its XML encoding.""" + xml_encoding = sniffed_xml_encoding = None + try: + if xml_data[:4] == '\x4c\x6f\xa7\x94': + # EBCDIC + xml_data = self._ebcdic_to_ascii(xml_data) + elif xml_data[:4] == '\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ + and (xml_data[2:4] != '\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ + (xml_data[2:4] != '\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == '\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + else: + sniffed_xml_encoding = 'ascii' + pass + except: + xml_encoding_match = None + xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() + xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) + if not xml_encoding_match and isHTML: + meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() + regexp = re.compile(meta_re, re.I) + xml_encoding_match = regexp.search(xml_data) + if xml_encoding_match is not None: + xml_encoding = xml_encoding_match.groups()[0].decode( + 'ascii').lower() + if isHTML: + self.declaredHTMLEncoding = xml_encoding + if sniffed_xml_encoding and \ + (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding + return xml_data, xml_encoding, sniffed_xml_encoding + + + def find_codec(self, charset): + return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset + + def _codec(self, 
charset): + if not charset: return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + EBCDIC_TO_ASCII_MAP = None + def _ebcdic_to_ascii(self, s): + c = self.__class__ + if not c.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + import string + c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ + ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + return s.translate(c.EBCDIC_TO_ASCII_MAP) + + MS_CHARS = { '\x80' : ('euro', '20AC'), + '\x81' : ' ', + '\x82' : ('sbquo', '201A'), + '\x83' : ('fnof', '192'), + '\x84' : ('bdquo', '201E'), + '\x85' : ('hellip', '2026'), + '\x86' : ('dagger', '2020'), + '\x87' : ('Dagger', '2021'), + '\x88' : ('circ', '2C6'), + '\x89' : ('permil', '2030'), + '\x8A' : ('Scaron', '160'), + '\x8B' : ('lsaquo', '2039'), + '\x8C' : ('OElig', '152'), + '\x8D' : '?', + '\x8E' : ('#x17D', '17D'), + '\x8F' : '?', + '\x90' : '?', + '\x91' : ('lsquo', '2018'), + '\x92' : ('rsquo', '2019'), + '\x93' : ('ldquo', '201C'), + '\x94' : ('rdquo', '201D'), + '\x95' : ('bull', '2022'), + '\x96' : ('ndash', '2013'), + '\x97' : ('mdash', '2014'), + '\x98' : ('tilde', '2DC'), + '\x99' : ('trade', '2122'), + '\x9a' : ('scaron', '161'), + '\x9b' : ('rsaquo', '203A'), + '\x9c' : ('oelig', '153'), + '\x9d' : '?', + '\x9e' : ('#x17E', '17E'), + '\x9f' : ('Yuml', ''),} diff --git a/src/beautifulsoup/dammit.py.3.diff b/src/beautifulsoup/dammit.py.3.diff new file mode 100644 index 0000000..f6bab68 --- /dev/null +++ b/src/beautifulsoup/dammit.py.3.diff @@ -0,0 +1,70 @@ +1800c1800 +< smart_quotes_re = "([\x80-\x9f])" +--- +> smart_quotes_re = b"([\x80-\x9f])" +1952,1983c1952,1983 +< MS_CHARS = { '\x80' : ('euro', '20AC'), +< '\x81' : ' ', +< '\x82' : ('sbquo', '201A'), +< '\x83' : ('fnof', '192'), +< '\x84' : ('bdquo', '201E'), +< '\x85' : ('hellip', '2026'), +< '\x86' : ('dagger', '2020'), +< '\x87' : ('Dagger', '2021'), +< '\x88' : ('circ', '2C6'), +< '\x89' : ('permil', '2030'), +< '\x8A' : ('Scaron', '160'), +< '\x8B' : ('lsaquo', '2039'), +< '\x8C' : ('OElig', '152'), +< '\x8D' : '?', +< '\x8E' : ('#x17D', '17D'), +< '\x8F' : '?', +< '\x90' : '?', +< '\x91' : ('lsquo', '2018'), +< '\x92' : ('rsquo', '2019'), +< '\x93' : ('ldquo', '201C'), +< '\x94' : ('rdquo', '201D'), +< '\x95' : ('bull', '2022'), +< '\x96' : ('ndash', '2013'), +< '\x97' : ('mdash', '2014'), +< '\x98' : ('tilde', '2DC'), +< '\x99' : ('trade', '2122'), +< '\x9a' : ('scaron', '161'), +< '\x9b' : ('rsaquo', '203A'), +< '\x9c' : ('oelig', '153'), +< '\x9d' : '?', +< '\x9e' : ('#x17E', '17E'), +< '\x9f' : ('Yuml', ''),} 
+--- +> MS_CHARS = { b'\x80' : ('euro', '20AC'), +> b'\x81' : ' ', +> b'\x82' : ('sbquo', '201A'), +> b'\x83' : ('fnof', '192'), +> b'\x84' : ('bdquo', '201E'), +> b'\x85' : ('hellip', '2026'), +> b'\x86' : ('dagger', '2020'), +> b'\x87' : ('Dagger', '2021'), +> b'\x88' : ('circ', '2C6'), +> b'\x89' : ('permil', '2030'), +> b'\x8A' : ('Scaron', '160'), +> b'\x8B' : ('lsaquo', '2039'), +> b'\x8C' : ('OElig', '152'), +> b'\x8D' : '?', +> b'\x8E' : ('#x17D', '17D'), +> b'\x8F' : '?', +> b'\x90' : '?', +> b'\x91' : ('lsquo', '2018'), +> b'\x92' : ('rsquo', '2019'), +> b'\x93' : ('ldquo', '201C'), +> b'\x94' : ('rdquo', '201D'), +> b'\x95' : ('bull', '2022'), +> b'\x96' : ('ndash', '2013'), +> b'\x97' : ('mdash', '2014'), +> b'\x98' : ('tilde', '2DC'), +> b'\x99' : ('trade', '2122'), +> b'\x9a' : ('scaron', '161'), +> b'\x9b' : ('rsaquo', '203A'), +> b'\x9c' : ('oelig', '153'), +> b'\x9d' : '?', +> b'\x9e' : ('#x17E', '17E'), +> b'\x9f' : ('Yuml', ''),} diff --git a/src/beautifulsoup/docs/__init__.py b/src/beautifulsoup/docs/__init__.py new file mode 100644 index 0000000..93d8446 --- /dev/null +++ b/src/beautifulsoup/docs/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2009 Canonical Ltd. All rights reserved. +# +# This file is part of beautifulsoup +# +# beautifulsoup is free software: you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# beautifulsoup is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with beautifulsoup. If not, see <http://www.gnu.org/licenses/>. +"""Executable documentation about beautifulsoup.""" diff --git a/src/beautifulsoup/element.py b/src/beautifulsoup/element.py new file mode 100644 index 0000000..3bdb01f --- /dev/null +++ b/src/beautifulsoup/element.py @@ -0,0 +1,870 @@ +import re +import types +try: + from htmlentitydefs import name2codepoint +except ImportError: + name2codepoint = {} + +from util import isString, isList + +DEFAULT_OUTPUT_ENCODING = "utf-8" + +class Entities: + """A mixin class that knows about XML entities.""" + + HTML_ENTITIES = "html" + XML_ENTITIES = "xml" + XHTML_ENTITIES = "xhtml" + + def _invert(h): + "Cheap function to invert a hash." + i = {} + for k,v in h.items(): + i[v] = k + return i + + XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", + "quot" : '"', + "amp" : "&", + "lt" : "<", + "gt" : ">" } + + XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = None + self.previousSibling = None + self.nextSibling = None + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def replaceWith(self, replaceWith): + oldParent = self.parent + myIndex = self.parent.contents.index(self) + if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: + # We're replacing this element with one of its siblings. 
+ index = self.parent.contents.index(replaceWith) + if index and index < myIndex: + # Furthermore, it comes before this element. That + # means that when we extract it, the index of this + # element will change. + myIndex = myIndex - 1 + self.extract() + oldParent.insert(myIndex, replaceWith) + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent: + try: + self.parent.contents.remove(self) + except ValueError: + pass + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + lastChild = self._lastRecursiveChild() + nextElement = lastChild.next + + if self.previous: + self.previous.next = nextElement + if nextElement: + nextElement.previous = self.previous + self.previous = None + lastChild.next = None + + self.parent = None + if self.previousSibling: + self.previousSibling.nextSibling = self.nextSibling + if self.nextSibling: + self.nextSibling.previousSibling = self.previousSibling + self.previousSibling = self.nextSibling = None + return self + + def _lastRecursiveChild(self): + "Finds the last element beneath this object to be parsed." + lastChild = self + while hasattr(lastChild, 'contents') and lastChild.contents: + lastChild = lastChild.contents[-1] + return lastChild + + def insert(self, position, newChild): + if (isinstance(newChild, basestring) + or isinstance(newChild, unicode)) \ + and not isinstance(newChild, NavigableString): + newChild = NavigableString(newChild) + + position = min(position, len(self.contents)) + if hasattr(newChild, 'parent') and newChild.parent != None: + # We're 'inserting' an element that's already one + # of this object's children. + if newChild.parent == self: + index = self.find(newChild) + if index and index < position: + # Furthermore we're moving it further down the + # list of this object's children. That means that + # when we extract this element, our target index + # will jump down one. + position = position - 1 + newChild.extract() + + newChild.parent = self + previousChild = None + if position == 0: + newChild.previousSibling = None + newChild.previous = self + else: + previousChild = self.contents[position-1] + newChild.previousSibling = previousChild + newChild.previousSibling.nextSibling = newChild + newChild.previous = previousChild._lastRecursiveChild() + if newChild.previous: + newChild.previous.next = newChild + + newChildsLastElement = newChild._lastRecursiveChild() + + if position >= len(self.contents): + newChild.nextSibling = None + + parent = self + parentsNextSibling = None + while not parentsNextSibling: + parentsNextSibling = parent.nextSibling + parent = parent.parent + if not parent: # This is the last element in the document. 
+ break + if parentsNextSibling: + newChildsLastElement.next = parentsNextSibling + else: + newChildsLastElement.next = None + else: + nextChild = self.contents[position] + newChild.nextSibling = nextChild + if newChild.nextSibling: + newChild.nextSibling.previousSibling = newChild + newChildsLastElement.next = nextChild + + if newChildsLastElement.next: + newChildsLastElement.next.previous = newChildsLastElement + self.contents.insert(position, newChild) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def findNext(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._findOne(self.findAllNext, name, attrs, text, **kwargs) + + def findAllNext(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.nextGenerator, + **kwargs) + + def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._findOne(self.findNextSiblings, name, attrs, text, + **kwargs) + + def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.nextSiblingGenerator, **kwargs) + fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + + def findPrevious(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) + + def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.previousGenerator, + **kwargs) + fetchPrevious = findAllPrevious # Compatibility with pre-3.x + + def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._findOne(self.findPreviousSiblings, name, attrs, text, + **kwargs) + + def findPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.previousSiblingGenerator, **kwargs) + fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + + def findParent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _findOne because findParents takes a different + # set of arguments. 
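+        # (findParents takes no 'text' argument, so the shared
+        # _findOne(method, name, attrs, text, ...) helper defined
+        # below can't be reused here.)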
+        r = None
+        l = self.findParents(name, attrs, 1)
+        if l:
+            r = l[0]
+        return r
+
+    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
+        """Returns the parents of this Tag that match the given
+        criteria."""
+
+        return self._findAll(name, attrs, None, limit, self.parentGenerator,
+                             **kwargs)
+    fetchParents = findParents # Compatibility with pre-3.x
+
+    #These methods do the real heavy lifting.
+
+    def _findOne(self, method, name, attrs, text, **kwargs):
+        r = None
+        l = method(name, attrs, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+
+    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+        "Iterates over a generator looking for things that match."
+
+        if isinstance(name, SoupStrainer):
+            strainer = name
+        else:
+            # Build a SoupStrainer
+            strainer = SoupStrainer(name, attrs, text, **kwargs)
+        results = ResultSet(strainer)
+        g = generator()
+        while True:
+            try:
+                i = g.next()
+            except StopIteration:
+                break
+            if i:
+                found = strainer.search(i)
+                if found:
+                    results.append(found)
+                    if limit and len(results) >= limit:
+                        break
+        return results
+
+    #These Generators can be used to navigate starting from both
+    #NavigableStrings and Tags.
+    def nextGenerator(self):
+        i = self
+        while i:
+            i = i.next
+            yield i
+
+    def nextSiblingGenerator(self):
+        i = self
+        while i:
+            i = i.nextSibling
+            yield i
+
+    def previousGenerator(self):
+        i = self
+        while i:
+            i = i.previous
+            yield i
+
+    def previousSiblingGenerator(self):
+        i = self
+        while i:
+            i = i.previousSibling
+            yield i
+
+    def parentGenerator(self):
+        i = self
+        while i:
+            i = i.parent
+            yield i
+
+    # Utility methods
+    def substituteEncoding(self, str, encoding=None):
+        encoding = encoding or "utf-8"
+        return str.replace("%SOUP-ENCODING%", encoding)
+
+    def toEncoding(self, s, encoding=None):
+        """Encodes an object to a string in some encoding, or to Unicode."""
+        if isinstance(s, unicode):
+            if encoding:
+                s = s.encode(encoding)
+        elif isinstance(s, str):
+            if encoding:
+                s = s.encode(encoding)
+            else:
+                s = unicode(s)
+        else:
+            if encoding:
+                s = self.toEncoding(str(s), encoding)
+            else:
+                s = unicode(s)
+        return s
+
+class NavigableString(unicode, PageElement):
+
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if isinstance(value, unicode):
+            return unicode.__new__(cls, value)
+        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
+    def __getnewargs__(self):
+        return (unicode(self),)
+
+    def __getattr__(self, attr):
+        """text.string gives you text. This is for backwards
+        compatibility for Navigable*String, but for CData* it lets you
+        get the string without the CData wrapper."""
+        if attr == 'string':
+            return self
+        else:
+            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+
+    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return self.decode().encode(encoding)
+
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return self
+
+class CData(NavigableString):
+
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<![CDATA[' + self + u']]>'
+
+class ProcessingInstruction(NavigableString):
+
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        output = self
+        if u'%SOUP-ENCODING%' in output:
+            output = self.substituteEncoding(output, eventualEncoding)
+        return u'<?' + output + u'?>'
+
+class Comment(NavigableString):
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<!--' + self + u'-->'
+
+class Declaration(NavigableString):
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<!' + self + u'>'
+
+class Tag(PageElement, Entities):
+
+    """Represents a found HTML tag with its attributes and contents."""
+
+    def _convertEntities(self, builder, match):
+        """Used in a call to re.sub to replace HTML, XML, and numeric
+        entities with the appropriate Unicode characters. If HTML
+        entities are being converted, any unrecognized entities are
+        escaped."""
+        x = match.group(1)
+        if builder.convertHTMLEntities and x in name2codepoint:
+            return unichr(name2codepoint[x])
+        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+            if builder.convertXMLEntities:
+                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+            else:
+                return u'&%s;' % x
+        elif len(x) > 0 and x[0] == '#':
+            # Handle numeric entities
+            if len(x) > 1 and x[1] == 'x':
+                return unichr(int(x[2:], 16))
+            else:
+                return unichr(int(x[1:]))
+
+        elif self.escapeUnrecognizedEntities:
+            # Escape the ampersand so the unrecognized entity survives
+            # output verbatim.
+            return u'&amp;%s;' % x
+        else:
+            return u'&%s;' % x
+
+    def __init__(self, parser, builder, name, attrs=None, parent=None,
+                 previous=None):
+        "Basic constructor."
+
+        # We don't actually store the parser object: that lets extracted
+        # chunks be garbage-collected
+        self.parserClass = parser.__class__
+        self.name = name
+        self.isSelfClosing = builder.isSelfClosingTag(name)
+        if attrs == None:
+            attrs = []
+        if isinstance(attrs, types.DictType):
+            self.attrMap = attrs
+        self.attrs = attrs
+        self.contents = []
+        self.setup(parent, previous)
+        self.hidden = False
+        self.containsSubstitutions = False
+        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+
+        # Convert any HTML, XML, or numeric entities in the attribute values.
+        convert_one = lambda x: self._convertEntities(parser.builder, x)
+        def convert(kval):
+            k, val = kval
+            if val is None:
+                return kval
+            return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", convert_one, val))
+        if isinstance(attrs, types.DictType):
+            self.attrs = [convert(kv) for kv in attrs.items()]
+        else:
+            self.attrs = map(convert, attrs)
+
+    def get(self, key, default=None):
+        """Returns the value of the 'key' attribute for the tag, or
+        the value given for 'default' if it doesn't have that
+        attribute."""
+        return self._getAttrMap().get(key, default)
+
+    def has_key(self, key):
+        return self._getAttrMap().has_key(key)
+
+    def __getitem__(self, key):
+        """tag[key] returns the value of the 'key' attribute for the tag,
+        and throws an exception if it's not there."""
+        return self._getAttrMap()[key]
+
+    def __iter__(self):
+        "Iterating over a tag iterates over its contents."
+ return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + findAll() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.findAll, args, kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.find(tag[:-3]) + elif tag.find('__') != 0: + return self.find(tag) + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.decode(eventualEncoding=encoding) + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + + ")") + + def _sub_entity(self, x): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + + def __unicode__(self): + return self.decode() + + def __str__(self): + return self.encode() + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) + + def decode(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): + """Returns a string or Unicode representation of this tag and + its contents. 
To get Unicode, pass None for encoding.""" + + attrs = [] + if self.attrs: + for key, val in self.attrs: + fmt = '%s="%s"' + if isString(val): + if (self.containsSubstitutions + and eventualEncoding is not None + and '%SOUP-ENCODING%' in val): + val = self.substituteEncoding(val, eventualEncoding) + + # The attribute value either: + # + # * Contains no embedded double quotes or single quotes. + # No problem: we enclose it in double quotes. + # * Contains embedded single quotes. No problem: + # double quotes work here too. + # * Contains embedded double quotes. No problem: + # we enclose it in single quotes. + # * Embeds both single _and_ double quotes. This + # can't happen naturally, but it can happen if + # you modify an attribute value after parsing + # the document. Now we have a bit of a + # problem. We solve it by enclosing the + # attribute in single quotes, and escaping any + # embedded single quotes to XML entities. + if '"' in val: + fmt = "%s='%s'" + if "'" in val: + # TODO: replace with apos when + # appropriate. + val = val.replace("'", "&squot;") + + # Now we're okay w/r/t quotes. But the attribute + # value might also contain angle brackets, or + # ampersands that aren't part of entities. We need + # to escape those to XML entities too. + val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) + if val is None: + # Handle boolean attributes. + decoded = key + else: + decoded = fmt % (key, val) + attrs.append(decoded) + close = '' + closeTag = '' + if self.isSelfClosing: + close = ' /' + else: + closeTag = '</%s>' % self.name + + indentTag, indentContents = 0, 0 + if prettyPrint: + indentTag = indentLevel + space = (' ' * (indentTag-1)) + indentContents = indentTag + 1 + contents = self.decodeContents(prettyPrint, indentContents, + eventualEncoding) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if prettyPrint: + s.append(space) + s.append('<%s%s%s>' % (self.name, attributeString, close)) + if prettyPrint: + s.append("\n") + s.append(contents) + if prettyPrint and contents and contents[-1] != "\n": + s.append("\n") + if prettyPrint and closeTag: + s.append(space) + s.append(closeTag) + if prettyPrint and closeTag and self.nextSibling: + s.append("\n") + s = ''.join(s) + return s + + def decompose(self): + """Recursively destroys the contents of this tree.""" + contents = [i for i in self.contents] + for i in contents: + if isinstance(i, Tag): + i.decompose() + else: + i.extract() + self.extract() + + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): + return self.encode(encoding, True) + + def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + return self.decodeContents(prettyPrint, indentLevel).encode(encoding) + + def decodeContents(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): + """Renders the contents of this tag as a string in the given + encoding. 
If encoding is None, returns a Unicode string."""
+        s=[]
+        for c in self:
+            text = None
+            if isinstance(c, NavigableString):
+                text = c.decodeGivenEventualEncoding(eventualEncoding)
+            elif isinstance(c, Tag):
+                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
+            if text and prettyPrint:
+                text = text.strip()
+            if text:
+                if prettyPrint:
+                    s.append(" " * (indentLevel-1))
+                s.append(text)
+                if prettyPrint:
+                    s.append("\n")
+        return ''.join(s)
+
+    #Soup methods
+
+    def find(self, name=None, attrs={}, recursive=True, text=None,
+             **kwargs):
+        """Return only the first child of this Tag matching the given
+        criteria."""
+        r = None
+        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+    findChild = find
+
+    def findAll(self, name=None, attrs={}, recursive=True, text=None,
+                limit=None, **kwargs):
+        """Extracts a list of Tag objects that match the given
+        criteria.  You can specify the name of the Tag and any
+        attributes you want the Tag to have.
+
+        The value of a key-value pair in the 'attrs' map can be a
+        string, a list of strings, a regular expression object, or a
+        callable that takes a string and returns whether or not the
+        string matches for some custom definition of 'matches'. The
+        same is true of the tag name."""
+        generator = self.recursiveChildGenerator
+        if not recursive:
+            generator = self.childGenerator
+        return self._findAll(name, attrs, text, limit, generator, **kwargs)
+    findChildren = findAll
+
+    # Pre-3.x compatibility methods. Will go away in 4.0.
+    first = find
+    fetch = findAll
+
+    def fetchText(self, text=None, recursive=True, limit=None):
+        return self.findAll(text=text, recursive=recursive, limit=limit)
+
+    def firstText(self, text=None, recursive=True):
+        return self.find(text=text, recursive=recursive)
+
+    # 3.x compatibility methods. Will go away in 4.0.
+    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                       prettyPrint=False, indentLevel=0):
+        if encoding is None:
+            return self.decodeContents(prettyPrint, indentLevel, encoding)
+        else:
+            return self.encodeContents(encoding, prettyPrint, indentLevel)
+
+
+    #Private methods
+
+    def _getAttrMap(self):
+        """Initializes a map representation of this tag's attributes,
+        if not already initialized."""
+        if not getattr(self, 'attrMap'):
+            self.attrMap = {}
+            for (key, value) in self.attrs:
+                self.attrMap[key] = value
+        return self.attrMap
+
+    #Generator methods
+    def childGenerator(self):
+        for i in range(0, len(self.contents)):
+            yield self.contents[i]
+        raise StopIteration
+
+    def recursiveChildGenerator(self):
+        if not len(self.contents):
+            raise StopIteration
+        stopNode = self._lastRecursiveChild().next
+        current = self.contents[0]
+        while current is not stopNode:
+            yield current
+            current = current.next
+
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer: + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = name + if isString(attrs): + kwargs['class'] = attrs + attrs = None + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + self.attrs = attrs + self.text = text + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def searchTag(self, markupName=None, markupAttrs={}): + found = None + markup = None + if isinstance(markupName, Tag): + markup = markupName + markupAttrs = markup + callFunctionWithTagData = callable(self.name) \ + and not isinstance(markupName, Tag) + + if (not self.name) \ + or callFunctionWithTagData \ + or (markup and self._matches(markup, self.name)) \ + or (not markup and self._matches(markupName, self.name)): + if callFunctionWithTagData: + match = self.name(markupName, markupAttrs) + else: + match = True + markupAttrMap = None + for attr, matchAgainst in self.attrs.items(): + if not markupAttrMap: + if hasattr(markupAttrs, 'get'): + markupAttrMap = markupAttrs + else: + markupAttrMap = {} + for k,v in markupAttrs: + markupAttrMap[k] = v + attrValue = markupAttrMap.get(attr) + if not self._matches(attrValue, matchAgainst): + match = False + break + if match: + if markup: + found = markup + else: + found = markupName + return found + + def search(self, markup): + #print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if isList(markup) and not isinstance(markup, Tag): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text: + found = self.searchTag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isString(markup): + if self._matches(markup, self.text): + found = markup + else: + raise Exception, "I don't know how to match against a %s" \ + % markup.__class__ + return found + + def _matches(self, markup, matchAgainst): + #print "Matching %s against %s" % (markup, matchAgainst) + result = False + if matchAgainst == True and type(matchAgainst) == types.BooleanType: + result = markup != None + elif callable(matchAgainst): + result = matchAgainst(markup) + else: + #Custom match methods take the tag as an argument, but all + #other ways of matching match the tag name as a string. + if isinstance(markup, Tag): + markup = markup.name + if markup is not None and not isString(markup): + markup = unicode(markup) + #Now we know that chunk is either a string, or None. + if hasattr(matchAgainst, 'match'): + # It's a regexp object. 
+ result = markup and matchAgainst.search(markup) + elif (isList(matchAgainst) + and (markup is not None or not isString(matchAgainst))): + result = markup in matchAgainst + elif hasattr(matchAgainst, 'items'): + result = markup.has_key(matchAgainst) + elif matchAgainst and isString(markup): + if isinstance(markup, unicode): + matchAgainst = unicode(matchAgainst) + else: + matchAgainst = str(matchAgainst) + + if not result: + result = matchAgainst == markup + return result + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source): + list.__init__([]) + self.source = source diff --git a/src/beautifulsoup/element.py.3.diff b/src/beautifulsoup/element.py.3.diff new file mode 100644 index 0000000..4549edd --- /dev/null +++ b/src/beautifulsoup/element.py.3.diff @@ -0,0 +1,8 @@ +92c92 +< from htmlentitydefs import name2codepoint +--- +> from html.entities import name2codepoint +337c337 +< i = g.next() +--- +> i = g.__next__() diff --git a/src/beautifulsoup/tests/__init__.py b/src/beautifulsoup/tests/__init__.py new file mode 100644 index 0000000..71c3f97 --- /dev/null +++ b/src/beautifulsoup/tests/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2009 Canonical Ltd. All rights reserved. +# +# This file is part of beautifulsoup +# +# beautifulsoup is free software: you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# beautifulsoup is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with beautifulsoup. If not, see <http://www.gnu.org/licenses/>. +"The beautifulsoup tests." diff --git a/src/beautifulsoup/tests/test_docs.py b/src/beautifulsoup/tests/test_docs.py new file mode 100644 index 0000000..51a53f7 --- /dev/null +++ b/src/beautifulsoup/tests/test_docs.py @@ -0,0 +1,51 @@ +# Copyright 2009 Canonical Ltd. All rights reserved. +# +# This file is part of beautifulsoup +# +# beautifulsoup is free software: you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# beautifulsoup is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with beautifulsoup. If not, see <http://www.gnu.org/licenses/>. +"Test harness for doctests." 
+
+# pylint: disable-msg=E0611,W0142
+
+__metaclass__ = type
+__all__ = [
+    'additional_tests',
+    ]
+
+import atexit
+import doctest
+import os
+from pkg_resources import (
+    resource_filename, resource_exists, resource_listdir, cleanup_resources)
+import unittest
+
+DOCTEST_FLAGS = (
+    doctest.ELLIPSIS |
+    doctest.NORMALIZE_WHITESPACE |
+    doctest.REPORT_NDIFF)
+
+
+def additional_tests():
+    "Run the doc tests (README.txt and docs/*, if any exist)"
+    doctest_files = [
+        os.path.abspath(resource_filename('beautifulsoup', 'README.txt'))]
+    if resource_exists('beautifulsoup', 'docs'):
+        for name in resource_listdir('beautifulsoup', 'docs'):
+            if name.endswith('.txt'):
+                doctest_files.append(
+                    os.path.abspath(
+                        resource_filename('beautifulsoup', 'docs/%s' % name)))
+    kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
+    atexit.register(cleanup_resources)
+    return unittest.TestSuite((
+        doctest.DocFileSuite(*doctest_files, **kwargs)))
diff --git a/src/beautifulsoup/tests/test_docs.py.3.diff b/src/beautifulsoup/tests/test_docs.py.3.diff
new file mode 100644
index 0000000..fc9636c
--- /dev/null
+++ b/src/beautifulsoup/tests/test_docs.py.3.diff
@@ -0,0 +1,122 @@
+433c433
+<         self.assertTrue('attr' in BeautifulSoup(text).foo)
+---
+>         self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
+622c622
+<         self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
+---
+>         self.assertSoupEquals('<x t="x&#241;">', b'<x t="x\xc3\xb1"></x>',
+624c624
+<         self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
+---
+>         self.assertSoupEquals('<x t="x&#xf1;">', b'<x t="x\xc3\xb1"></x>',
+671c671
+<         markup = "<foo>\x92</foo>"
+---
+>         markup = b"<foo>\x92</foo>"
+675c675
+<         hebrew = "\xed\xe5\xec\xf9"
+---
+>         hebrew = b"\xed\xe5\xec\xf9"
+687c687
+<         self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
+---
+>         self.assertEquals(utf8, b'<foo>\xc3\xbc</foo>')
+714,715c714,715
+<         euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
+<         utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
+---
+>         euc_jp = b'<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
+>         utf8 = b"<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
+726c726
+<         old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
+---
+>         old_text = b"<?xml encoding='windows-1252'><foo>\x92</foo>"
+731c731
+<         no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja"
/></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' +--- +> no_shift_jis_html = b'''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>''' +741,742c741,742 +< meta_tag = ('<meta content="text/html; charset=x-sjis" ' +< 'http-equiv="Content-type" />') +--- +> meta_tag = (b'<meta content="text/html; charset=x-sjis" ' +> b'http-equiv="Content-type" />') +744,750c744,750 +< '<html><head>\n%s\n' +< '<meta http-equiv="Content-language" content="ja" />' +< '</head><body><pre>\n' +< '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' +< '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' +< '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' +< '</pre></body></html>') % meta_tag +--- +> b'<html><head>\n' + meta_tag + b'\n' +> b'<meta http-equiv="Content-language" content="ja" />' +> b'</head><body><pre>\n' +> b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' +> b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' +> b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n' +> b'</pre></body></html>') +763c763 +< content_type = str(soup.meta) +--- +> content_type = soup.meta.decode() +768c768 +< index = content_type.find('charset=') +--- +> index = content_type.find(b'charset=') +773,783c773,783 +< '<html><head>\n' +< '<meta content="text/html; charset=utf-8" ' +< 'http-equiv="Content-type" />\n' +< '<meta http-equiv="Content-language" content="ja" />' +< '</head><body><pre>\n' +< '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' +< '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' +< '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' +< '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' +< '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' +< '</pre></body></html>')) +--- +> b'<html><head>\n' +> b'<meta content="text/html; charset=utf-8" ' +> b'http-equiv="Content-type" />\n' +> b'<meta http-equiv="Content-language" content="ja" />' +> b'</head><body><pre>\n' +> b'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3' +> b'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3' +> b'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6' +> b'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3' +> b'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n' +> b'</pre></body></html>')) +788c788 +< isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" +--- +> isolatin = b"""<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" +792c792 +< utf8 = utf8.replace("\xe9", "\xc3\xa9") +--- +> utf8 = utf8.replace(b"\xe9", b"\xc3\xa9") +796,797c796,797 +< iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n' +< utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n' +--- +> iso_8859_8= 
b'<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
+>         utf8 = b'<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
+802c802
+<         self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
+---
+>         self.assertSoupEquals(b"\x91Foo\x92 <!--blah-->",
+806c806
+<         smartQuotes = "Il a dit, \x8BSacr&eacute; bleu!\x9b"
+---
+>         smartQuotes = b"Il a dit, \x8BSacr&eacute; bleu!\x9b"
+812c812
+<                           'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
+---
+>                           b'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
+815c815
+<         utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
+---
+>         utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
diff --git a/src/beautifulsoup/tests/test_soup.py b/src/beautifulsoup/tests/test_soup.py
new file mode 100644
index 0000000..e7e5680
--- /dev/null
+++ b/src/beautifulsoup/tests/test_soup.py
@@ -0,0 +1,854 @@
+# -*- coding: utf-8 -*-
+"""Unit tests for Beautiful Soup.
+
+These tests make sure Beautiful Soup works as it should. If you
+find a bug in Beautiful Soup, the best way to express it is as a test
+case like this that fails."""
+
+import re
+import unittest
+from beautifulsoup import *
+from element import CData, Comment, Declaration, SoupStrainer, Tag
+from builder import ICantBelieveItsValidHTMLTreeBuilder
+from dammit import UnicodeDammit
+
+class SoupTest(unittest.TestCase):
+
+    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
+                         encoding=None):
+        """Parse the given text and make sure its string rep is the other
+        given text."""
+        if rep == None:
+            rep = toParse
+        obj = c(toParse)
+        if encoding is None:
+            rep2 = obj.decode()
+        else:
+            rep2 = obj.encode(encoding)
+        self.assertEqual(rep2, rep)
+
+
+class FollowThatTag(SoupTest):
+
+    "Tests the various ways of fetching tags from a soup."
+ + def setUp(self): + ml = """ + <a id="x">1</a> + <A id="a">2</a> + <b id="b">3</a> + <b href="foo" id="x">4</a> + <ac width=100>4</ac>""" + self.soup = BeautifulStoneSoup(ml) + + def testFindAllByName(self): + matching = self.soup('a') + self.assertEqual(len(matching), 2) + self.assertEqual(matching[0].name, 'a') + self.assertEqual(matching, self.soup.findAll('a')) + self.assertEqual(matching, self.soup.findAll(SoupStrainer('a'))) + + def testFindAllByAttribute(self): + matching = self.soup.findAll(id='x') + self.assertEqual(len(matching), 2) + self.assertEqual(matching[0].name, 'a') + self.assertEqual(matching[1].name, 'b') + + matching2 = self.soup.findAll(attrs={'id' : 'x'}) + self.assertEqual(matching, matching2) + + strainer = SoupStrainer(attrs={'id' : 'x'}) + self.assertEqual(matching, self.soup.findAll(strainer)) + + self.assertEqual(len(self.soup.findAll(id=None)), 1) + + self.assertEqual(len(self.soup.findAll(width=100)), 1) + self.assertEqual(len(self.soup.findAll(junk=None)), 5) + self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5) + + self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0) + self.assertEqual(len(self.soup.findAll(junk=True)), 0) + + self.assertEqual(len(self.soup.findAll(junk=True)), 0) + self.assertEqual(len(self.soup.findAll(href=True)), 1) + + def testFindallByClass(self): + soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>') + self.assertEqual(soup.find('a', '1').string, "Bar") + + def testFindAllByList(self): + matching = self.soup(['a', 'ac']) + self.assertEqual(len(matching), 3) + + def testFindAllByHash(self): + matching = self.soup({'a' : True, 'b' : True}) + self.assertEqual(len(matching), 4) + + def testFindAllText(self): + soup = BeautifulSoup("<html>\xbb</html>") + self.assertEqual(soup.findAll(text=re.compile('.*')), + [u'\xbb']) + + def testFindAllByRE(self): + import re + r = re.compile('a.*') + self.assertEqual(len(self.soup(r)), 3) + + def testFindAllByMethod(self): + def matchTagWhereIDMatchesName(tag): + return tag.name == tag.get('id') + + matching = self.soup.findAll(matchTagWhereIDMatchesName) + self.assertEqual(len(matching), 2) + self.assertEqual(matching[0].name, 'a') + + def testParents(self): + soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah') + b = soup.b + self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2) + self.assertEquals(b.findParent('ul')['a'], 'b') + + PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">') + + def testNext(self): + soup = self.PROXIMITY_TEST + b = soup.find('b', {'id' : 2}) + self.assertEquals(b.findNext('b')['id'], '3') + self.assertEquals(b.findNext('b')['id'], '3') + self.assertEquals(len(b.findAllNext('b')), 2) + self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1) + + def testPrevious(self): + soup = self.PROXIMITY_TEST + b = soup.find('b', {'id' : 3}) + self.assertEquals(b.findPrevious('b')['id'], '2') + self.assertEquals(b.findPrevious('b')['id'], '2') + self.assertEquals(len(b.findAllPrevious('b')), 2) + self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1) + + + SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">') + + def testNextSibling(self): + soup = self.SIBLING_TEST + tag = 'blockquote' + b = soup.find(tag, {'id' : 2}) + self.assertEquals(b.findNext(tag)['id'], '2.1') + 
self.assertEquals(b.findNextSibling(tag)['id'], '3') + self.assertEquals(b.findNextSibling(tag)['id'], '3') + self.assertEquals(len(b.findNextSiblings(tag)), 2) + self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1) + + def testPreviousSibling(self): + soup = self.SIBLING_TEST + tag = 'blockquote' + b = soup.find(tag, {'id' : 3}) + self.assertEquals(b.findPrevious(tag)['id'], '2.1') + self.assertEquals(b.findPreviousSibling(tag)['id'], '2') + self.assertEquals(b.findPreviousSibling(tag)['id'], '2') + self.assertEquals(len(b.findPreviousSiblings(tag)), 2) + self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1) + + def testTextNavigation(self): + soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh') + baz = soup.find(text='Baz') + self.assertEquals(baz.findParent("i")['id'], '1') + self.assertEquals(baz.findNext(text='Blee'), 'Blee') + self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee') + self.assertEquals(baz.findNextSibling(text='Blargh'), None) + self.assertEquals(baz.findNextSibling('hr')['id'], '1') + +class SiblingRivalry(SoupTest): + "Tests the nextSibling and previousSibling navigation." + + def testSiblings(self): + soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>") + secondLI = soup.find('li').nextSibling + self.assert_(secondLI.name == 'li' and secondLI.string == '2') + self.assertEquals(soup.find(text='1').nextSibling.name, 'p') + self.assertEquals(soup.find('p').nextSibling, 'B') + self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B') + +class TagsAreObjectsToo(SoupTest): + "Tests the various built-in functions of Tag objects." + + def testLen(self): + soup = BeautifulSoup("<top>1<b>2</b>3</top>") + self.assertEquals(len(soup.top), 3) + +class StringEmUp(SoupTest): + "Tests the use of 'string' as an alias for a tag's only content." + + def testString(self): + s = BeautifulSoup("<b>foo</b>") + self.assertEquals(s.b.string, 'foo') + + def testLackOfString(self): + s = BeautifulSoup("<b>f<i>e</i>o</b>") + self.assert_(not s.b.string) + +class ThatsMyLimit(SoupTest): + "Tests the limit argument." + + def testBasicLimits(self): + s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />') + self.assertEquals(len(s.findAll('br')), 4) + self.assertEquals(len(s.findAll('br', limit=2)), 2) + self.assertEquals(len(s('br', limit=2)), 2) + +class OnlyTheLonely(SoupTest): + "Tests the parseOnly argument to the constructor." + def setUp(self): + x = [] + for i in range(1,6): + x.append('<a id="%s">' % i) + for j in range(100,103): + x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j)) + x.append('</a>') + self.x = ''.join(x) + + def testOnly(self): + strainer = SoupStrainer("b") + soup = BeautifulSoup(self.x, parseOnlyThese=strainer) + self.assertEquals(len(soup), 15) + + strainer = SoupStrainer(id=re.compile("100.*")) + soup = BeautifulSoup(self.x, parseOnlyThese=strainer) + self.assertEquals(len(soup), 5) + + strainer = SoupStrainer(text=re.compile("10[01].*")) + soup = BeautifulSoup(self.x, parseOnlyThese=strainer) + self.assertEquals(len(soup), 10) + + strainer = SoupStrainer(text=lambda(x):x[8]=='3') + soup = BeautifulSoup(self.x, parseOnlyThese=strainer) + self.assertEquals(len(soup), 3) + +class PickleMeThis(SoupTest): + "Testing features like pickle and deepcopy." 
+ + def setUp(self): + self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" +"http://www.w3.org/TR/REC-html40/transitional.dtd"> +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +<title>Beautiful Soup: We called him Tortoise because he taught us.</title> +<link rev="made" href="mailto:leonardr@segfault.org"> +<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> +<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> +<meta name="author" content="Leonard Richardson"> +</head> +<body> +<a href="foo">foo</a> +<a href="foo"><b>bar</b></a> +</body> +</html>""" + + self.soup = BeautifulSoup(self.page) + + def testPickle(self): + import pickle + dumped = pickle.dumps(self.soup, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), self.soup.decode()) + + def testDeepcopy(self): + from copy import deepcopy + deepcopy(BeautifulSoup("<a></a>")) + copied = deepcopy(self.soup) + self.assertEqual(copied.decode(), self.soup.decode()) + + def testUnicodePickle(self): + import cPickle as pickle + html = "<b>" + chr(0xc3) + "</b>" + soup = BeautifulSoup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.decode(), soup.decode()) + + +class WriteOnlyCode(SoupTest): + "Testing the modification of the tree." + + def testModifyAttributes(self): + soup = BeautifulSoup('<a id="1"></a>') + soup.a['id'] = 2 + self.assertEqual(soup.decode(), '<a id="2"></a>') + del(soup.a['id']) + self.assertEqual(soup.decode(), '<a></a>') + soup.a['id2'] = 'foo' + self.assertEqual(soup.decode(), '<a id2="foo"></a>') + + def testNewTagCreation(self): + "Makes sure tags don't step on each others' toes." + soup = BeautifulSoup() + builder = HTMLParserTreeBuilder() + a = Tag(soup, builder, 'a') + ol = Tag(soup, builder, 'ol') + a['href'] = 'http://foo.com/' + self.assertRaises(KeyError, lambda : ol['href']) + + def testTagReplacement(self): + # Make sure you can replace an element with itself. 
+ text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" + soup = BeautifulSoup(text) + c = soup.c + soup.c.replaceWith(c) + self.assertEquals(soup.decode(), text) + + # A very simple case + soup = BeautifulSoup("<b>Argh!</b>") + soup.find(text="Argh!").replaceWith("Hooray!") + newText = soup.find(text="Hooray!") + b = soup.b + self.assertEqual(newText.previous, b) + self.assertEqual(newText.parent, b) + self.assertEqual(newText.previous.next, newText) + self.assertEqual(newText.next, None) + + # A more complex case + soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>") + soup.b.insert(1, "Hooray!") + newText = soup.find(text="Hooray!") + self.assertEqual(newText.previous, "Argh!") + self.assertEqual(newText.previous.next, newText) + + self.assertEqual(newText.previousSibling, "Argh!") + self.assertEqual(newText.previousSibling.nextSibling, newText) + + self.assertEqual(newText.nextSibling, None) + self.assertEqual(newText.next, soup.c) + + text = "<html>There's <b>no</b> business like <b>show</b> business</html>" + soup = BeautifulSoup(text) + no, show = soup.findAll('b') + show.replaceWith(no) + self.assertEquals(soup.decode(), "<html>There's business like <b>no</b> business</html>") + + # Even more complex + soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>") + builder = HTMLParserTreeBuilder() + tag = Tag(soup, builder, 'magictag') + tag.insert(0, "the") + soup.a.insert(1, tag) + + b = soup.b + c = soup.c + theText = tag.find(text=True) + findText = b.find(text="Find") + + self.assertEqual(findText.next, tag) + self.assertEqual(tag.previous, findText) + self.assertEqual(b.nextSibling, tag) + self.assertEqual(tag.previousSibling, b) + self.assertEqual(tag.nextSibling, c) + self.assertEqual(c.previousSibling, tag) + + self.assertEqual(theText.next, c) + self.assertEqual(c.previous, theText) + + # Aand... incredibly complex. + soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""") + f = soup.f + a = soup.a + c = soup.c + e = soup.e + weText = a.find(text="We") + soup.b.replaceWith(soup.f) + self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>") + + self.assertEqual(f.previous, weText) + self.assertEqual(weText.next, f) + self.assertEqual(f.previousSibling, weText) + self.assertEqual(f.nextSibling, None) + self.assertEqual(weText.nextSibling, f) + + def testAppend(self): + doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>" + soup = BeautifulSoup(doc) + second_para = soup('p')[1] + bold = soup.find('b') + soup('p')[1].append(soup.find('b')) + self.assertEqual(bold.parent, second_para) + self.assertEqual(soup.decode(), + "<p>Don't leave me .</p> " + "<p>Don't leave me.<b>here</b></p>") + + def testTagExtraction(self): + # A very simple case + text = '<html><div id="nav">Nav crap</div>Real content here.</html>' + soup = BeautifulSoup(text) + extracted = soup.find("div", id="nav").extract() + self.assertEqual(soup.decode(), "<html>Real content here.</html>") + self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>') + + # A simple case, a more complex test. 
+        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
+        soup = BeautifulStoneSoup(text)
+        doc = soup.doc
+        numbers, roman, letters = soup("a")
+
+        self.assertEqual(roman.parent, doc)
+        oldPrevious = roman.previous
+        endOfThisTag = roman.nextSibling.previous
+        self.assertEqual(oldPrevious, "2")
+        self.assertEqual(roman.next, "i")
+        self.assertEqual(endOfThisTag, "ii")
+        self.assertEqual(roman.previousSibling, numbers)
+        self.assertEqual(roman.nextSibling, letters)
+
+        roman.extract()
+        self.assertEqual(roman.parent, None)
+        self.assertEqual(roman.previous, None)
+        self.assertEqual(roman.next, "i")
+        self.assertEqual(letters.previous, '2')
+        self.assertEqual(roman.previousSibling, None)
+        self.assertEqual(roman.nextSibling, None)
+        self.assertEqual(endOfThisTag.next, None)
+        self.assertEqual(roman.b.contents[0].next, None)
+        self.assertEqual(numbers.nextSibling, letters)
+        self.assertEqual(letters.previousSibling, numbers)
+        self.assertEqual(len(doc.contents), 2)
+        self.assertEqual(doc.contents[0], numbers)
+        self.assertEqual(doc.contents[1], letters)
+
+        # A more complex case.
+        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
+        soup = BeautifulStoneSoup(text)
+        one = soup.find(text="1")
+        three = soup.find(text="3")
+        toExtract = soup.b
+        soup.b.extract()
+        self.assertEqual(one.next, three)
+        self.assertEqual(three.previous, one)
+        self.assertEqual(one.parent.nextSibling, three)
+        self.assertEqual(three.previousSibling, soup.a)
+
+class TheManWithoutAttributes(SoupTest):
+    "Test attribute access"
+
+    def testHasKey(self):
+        text = "<foo attr='bar'>"
+        self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
+
+class QuoteMeOnThat(SoupTest):
+    "Test quoting"
+    def testQuotedAttributeValues(self):
+        self.assertSoupEquals("<foo attr='bar'></foo>",
+                              '<foo attr="bar"></foo>')
+
+        text = """<foo attr='bar "brawls" happen'>a</foo>"""
+        soup = BeautifulSoup(text)
+        self.assertEquals(soup.decode(), text)
+
+        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
+        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
+        self.assertSoupEquals(soup.decode(), newText)
+
+        self.assertSoupEquals('<this is="really messed up &amp; stuff">',
+                              '<this is="really messed up &amp; stuff"></this>')
+
+
+
+class YoureSoLiteral(SoupTest):
+    "Test literal mode."
+    def testLiteralMode(self):
+        text = "<script>if (i<imgs.length)</script><b>Foo</b>"
+        soup = BeautifulSoup(text)
+        self.assertEqual(soup.script.contents[0], "if (i<imgs.length)")
+        self.assertEqual(soup.b.contents[0], "Foo")
+
+    def testTextArea(self):
+        text = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
+        soup = BeautifulSoup(text)
+        self.assertEqual(soup.textarea.contents[0],
+                         "<b>This is an example of an HTML tag</b><&<&")
+
+class OperatorOverload(SoupTest):
+    "Our operators do it all! Call now!"
+
+    def testTagNameAsFind(self):
+        "Tests that referencing a tag name as a member delegates to find()."
+        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
+        self.assertEqual(soup.b.i, soup.find('b').find('i'))
+        self.assertEqual(soup.b.i.string, 'bar')
+        self.assertEqual(soup.b['id'], '1')
+        self.assertEqual(soup.b.contents[0], 'foo')
+        self.assert_(not soup.a)
+
+        #Test the .fooTag variant of .foo.
+        self.assertEqual(soup.bTag.iTag.string, 'bar')
+        self.assertEqual(soup.b.iTag.string, 'bar')
+        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
+
+class NestableEgg(SoupTest):
+    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""
+
+    def testParaInsideBlockquote(self):
+        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
+        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
+        self.assertEqual(soup.blockquote.b.string, 'Foo')
+        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')
+
+    def testNestedTables(self):
+        text = """<table id="1"><tr><td>Here's another table:
+            <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
+        soup = BeautifulSoup(text)
+        self.assertEquals(soup.table.table.td.string, 'Juicy text')
+        self.assertEquals(len(soup.findAll('table')), 2)
+        self.assertEquals(len(soup.table.findAll('table')), 1)
+        self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
+                          'table')
+
+        text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
+        soup = BeautifulSoup(text)
+        self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")
+
+        text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
+        <tfoot><tr>Baz</tr></tfoot></table>"""
+        soup = BeautifulSoup(text)
+        self.assertEquals(soup.table.thead.tr.contents[0], "Foo")
+
+    def testBadNestedTables(self):
+        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
+        self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
+
+class CleanupOnAisleFour(SoupTest):
+    """Here we test cleanup of text that breaks HTMLParser or is just
+    obnoxious."""
+
+    def testSelfClosingtag(self):
+        self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
+                         '<br />')
+
+        self.assertSoupEquals('<p>test1<br/>test2</p>',
+                              '<p>test1<br />test2</p>')
+
+        text = '<p>test1<selfclosing>test2'
+        soup = BeautifulStoneSoup(text)
+        self.assertEqual(soup.decode(),
+                         '<p>test1<selfclosing>test2</selfclosing></p>')
+
+        builder = HTMLParserXMLTreeBuilder(selfClosingTags='selfclosing')
+        soup = BeautifulSoup(text, builder)
+        self.assertEqual(soup.decode(),
+                         '<p>test1<selfclosing />test2</p>')
+
+    def testSelfClosingTagOrNot(self):
+        text = "<item><link>http://foo.com/</link></item>"
+        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
+        self.assertEqual(BeautifulSoup(text).decode(),
+                         '<item><link />http://foo.com/</item>')
+
+    def testBooleanAttributes(self):
+        text = "<td nowrap>foo</td>"
+        self.assertSoupEquals(text, text)
+
+    def testCData(self):
+        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
+        self.assertSoupEquals(xml, xml)
+        r = re.compile("foo.*bar")
+        soup = BeautifulSoup(xml)
+        self.assertEquals(soup.find(text=r).string, "foobar")
+        self.assertEquals(soup.find(text=r).__class__, CData)
+
+    def testComments(self):
+        xml = "foo<!--foobar-->baz"
+        self.assertSoupEquals(xml)
+        r = re.compile("foo.*bar")
+        soup = BeautifulSoup(xml)
+        self.assertEquals(soup.find(text=r).string, "foobar")
+        self.assertEquals(soup.find(text="foobar").__class__, Comment)
+
+    def testDeclaration(self):
+        xml = "foo<!DOCTYPE foobar>baz"
+        self.assertSoupEquals(xml)
+        r = re.compile(".*foo.*bar")
+        soup = BeautifulSoup(xml)
+        text = "DOCTYPE foobar"
+        self.assertEquals(soup.find(text=r).string, text)
+        self.assertEquals(soup.find(text=text).__class__, Declaration)
+
+        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
+                              '<html>foo</html>')
+        soup = BeautifulSoup(namespaced_doctype)
+        self.assertEquals(soup.contents[0],
+                          'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
+        self.assertEquals(soup.html.contents[0], 'foo')
+
+    def testEntityConversions(self):
+        text = "&lt;&lt;sacr&eacute; bleu!&gt;&gt;"
+        soup = BeautifulStoneSoup(text)
+        self.assertSoupEquals(text)
+
+        xmlEnt = Entities.XML_ENTITIES
+        htmlEnt = Entities.HTML_ENTITIES
+        xhtmlEnt = Entities.XHTML_ENTITIES
+
+        xmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xmlEnt)
+        htmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=htmlEnt)
+        xhtmlBuilder = HTMLParserXMLTreeBuilder(convertEntities=xhtmlEnt)
+
+        soup = BeautifulStoneSoup(text, xmlBuilder)
+        self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")
+
+        soup = BeautifulStoneSoup(text, xmlBuilder)
+        self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")
+
+        soup = BeautifulStoneSoup(text, htmlBuilder)
+        self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>")
+
+        # Make sure the "XML", "HTML", and "XHTML" settings work.
+        text = "&lt;&trade;&apos;"
+        soup = BeautifulStoneSoup(text, xmlBuilder)
+        self.assertEquals(soup.decode(), u"<&trade;'")
+
+        soup = BeautifulStoneSoup(text, htmlBuilder)
+        self.assertEquals(soup.decode(), u"<\u2122&apos;")
+
+        soup = BeautifulStoneSoup(text, xhtmlBuilder)
+        self.assertEquals(soup.decode(), u"<\u2122'")
+
+    def testNonBreakingSpaces(self):
+        builder = HTMLParserTreeBuilder(
+            convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
+        soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>", builder)
+        self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")
+
+    def testWhitespaceInDeclaration(self):
+        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')
+
+    def testJunkInDeclaration(self):
+        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')
+
+    def testIncompleteDeclaration(self):
+        self.assertSoupEquals('a<!b <p>c')
+
+    def testEntityReplacement(self):
+        self.assertSoupEquals('<b>hello&nbsp;there</b>')
+
+    def testEntitiesInAttributeValues(self):
+        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
+                              encoding='utf-8')
+        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
+                              encoding='utf-8')
+
+        builder = HTMLParserTreeBuilder(convertEntities=Entities.HTML_ENTITIES)
+        soup = BeautifulSoup('<x t="&gt;&trade;">', builder)
+        self.assertEquals(soup.decode(), u'<x t="&gt;\u2122"></x>')
+
+        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
+        link = '<a href="%s"></a>' % uri
+
+        soup = BeautifulSoup(link, builder)
+        self.assertEquals(soup.decode(),
+                          link.replace("&eacute;", u"\xe9"))
+
+        uri = "http://crummy.com?sacr&eacute;&bleu"
+        link = '<a href="%s"></a>' % uri
+        soup = BeautifulSoup(link, builder)
+        self.assertEquals(soup.a['href'],
+                          uri.replace("&eacute;", u"\xe9"))
+
+    def testNakedAmpersands(self):
+        builder = HTMLParserXMLTreeBuilder(convertEntities=Entities.HTML_ENTITIES)
+        soup = BeautifulStoneSoup("AT&T ", builder)
+        self.assertEquals(soup.decode(), 'AT&amp;T ')
+
+        nakedAmpersandInASentence = "AT&T was Ma Bell"
+        soup = BeautifulStoneSoup(nakedAmpersandInASentence, builder)
+        self.assertEquals(soup.decode(), \
+            nakedAmpersandInASentence.replace('&','&amp;'))
+
+        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+        validURL = invalidURL.replace('&','&amp;')
+        soup = BeautifulStoneSoup(invalidURL)
+        self.assertEquals(soup.decode(), validURL)
+
+        soup = BeautifulStoneSoup(validURL)
+        self.assertEquals(soup.decode(), validURL)
+
+
+class EncodeRed(SoupTest):
+    """Tests encoding conversion, Unicode conversion, and Microsoft
+    smart quote fixes."""
+
+    def testUnicodeDammitStandalone(self):
+        markup = "<foo>\x92</foo>"
+        dammit = UnicodeDammit(markup)
+        self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")
+
+        hebrew = "\xed\xe5\xec\xf9"
+        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
+        self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
+        self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
+
+    def testGarbageInGarbageOut(self):
+        ascii = "<foo>a</foo>"
+        asciiSoup = BeautifulStoneSoup(ascii)
+        self.assertEquals(ascii, asciiSoup.decode())
+
+        unicodeData = u"<foo>\u00FC</foo>"
+        utf8 = unicodeData.encode("utf-8")
+        self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
+
+        unicodeSoup = BeautifulStoneSoup(unicodeData)
+        self.assertEquals(unicodeData, unicodeSoup.decode())
+        self.assertEquals(unicodeSoup.foo.string, u'\u00FC')
+
+        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
+        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
+        self.assertEquals(utf8Soup.originalEncoding, "utf-8")
+
+        utf8Soup = BeautifulStoneSoup(unicodeData)
+        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
+        self.assertEquals(utf8Soup.originalEncoding, None)
+
+
+    def testHandleInvalidCodec(self):
+        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
+            soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
+                                 fromEncoding=bad_encoding)
+            self.assertEquals(soup.originalEncoding, 'utf-8')
+
+    def testUnicodeSearch(self):
+        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
+        soup = BeautifulSoup(html)
+        self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')
+
+    def testRewrittenXMLHeader(self):
+        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
+        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
+        soup = BeautifulStoneSoup(euc_jp)
+        if soup.originalEncoding != "euc-jp":
+            raise Exception("Test failed when parsing euc-jp document. "
+                            "If you're running Python >=2.4, or you have "
+                            "cjkcodecs installed, this is a real problem. "
+                            "Otherwise, ignore it.")
+
+        self.assertEquals(soup.originalEncoding, "euc-jp")
+        self.assertEquals(soup.renderContents('utf-8'), utf8)
+
+        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
+        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
+        self.assertSoupEquals(old_text, new_text)
+
+    def testRewrittenMetaTag(self):
+        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
+        soup = BeautifulSoup(no_shift_jis_html)
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
+        strainer = SoupStrainer('pre')
+        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
+        self.assertEquals(soup.contents[0].name, 'pre')
+
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type" />')
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja" />'
+            '</head><body><pre>\n'
+            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
+            '</pre></body></html>') % meta_tag
+        soup = BeautifulSoup(shift_jis_html)
+        if soup.originalEncoding != "shift-jis":
+            raise Exception("Test failed when parsing shift-jis document "
+                            "with meta tag '%s'."
+                            "If you're running Python >=2.4, or you have "
+                            "cjkcodecs installed, this is a real problem. "
+                            "Otherwise, ignore it." % meta_tag)
+        self.assertEquals(soup.originalEncoding, "shift-jis")
+
+        content_type_tag = soup.meta['content']
+        self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
+                          'charset=%SOUP-ENCODING%')
+        content_type = str(soup.meta)
+        index = content_type.find('charset=')
+        self.assertEqual(content_type[index:index+len('charset=utf8')+1],
+                         'charset=utf-8')
+        content_type = soup.meta.encode('shift-jis')
+        index = content_type.find('charset=')
+        self.assertEqual(content_type[index:index+len('charset=shift-jis')],
+                         'charset=shift-jis'.encode())
+
+        self.assertEquals(soup.encode('utf-8'), (
+            '<html><head>\n'
+            '<meta content="text/html; charset=utf-8" '
+            'http-equiv="Content-type" />\n'
+            '<meta http-equiv="Content-language" content="ja" />'
+            '</head><body><pre>\n'
+            '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
+            '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
+            '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
+            '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
+            '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
+            '</pre></body></html>'))
+        self.assertEquals(soup.encode("shift-jis"),
+                          shift_jis_html.replace('x-sjis'.encode(),
+                                                 'shift-jis'.encode()))
+
+        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
+        soup = BeautifulSoup(isolatin)
+
+        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
+        utf8 = utf8.replace("\xe9", "\xc3\xa9")
+        self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')
+
+    def testHebrew(self):
+        iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
+        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
+        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
+        self.assertEquals(soup.encode('utf-8'), utf8)
+
+    def testSmartQuotesNotSoSmartAnymore(self):
+        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
+                              '&lsquo;Foo&rsquo; <!--blah-->')
+
+    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
+        smartQuotes = "Il a dit, \x8BSacr&eacute; bleu!\x9b"
+        soup = BeautifulSoup(smartQuotes)
+        self.assertEquals(soup.decode(),
+                          'Il a dit, &lsaquo;Sacr&eacute; bleu!&rsaquo;')
+        builder = HTMLParserTreeBuilder(convertEntities="html")
+        soup = BeautifulSoup(smartQuotes, builder)
+        self.assertEquals(soup.encode('utf-8'),
+                          'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
+
+    def testDontSeeSmartQuotesWhereThereAreNone(self):
+        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
+        self.assertSoupEquals(utf_8, encoding='utf-8')
+
+
+class Whitewash(SoupTest):
+    """Test whitespace preservation."""
+
+    def testPreservedWhitespace(self):
+        self.assertSoupEquals("<pre> </pre>")
+        self.assertSoupEquals("<pre> woo </pre>")
+
+    def testCollapsedWhitespace(self):
+        self.assertSoupEquals("<p> </p>", "<p> </p>")
+
+
+class AlternateBuilders(SoupTest):
+    """Test alternate builders."""
+
+    def testICantBelieveItsValidHTML(self):
+        builder = ICantBelieveItsValidHTMLTreeBuilder()
+        markup = "<b>Foo<b>Bar</b></b>"
+
+        soup = BeautifulSoup(markup)
+        self.assertEquals(soup.decode(), "<b>Foo</b><b>Bar</b>")
+
+        soup = BeautifulSoup(markup, builder=builder)
+        self.assertEquals(soup.decode(), markup)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/beautifulsoup/util.py b/src/beautifulsoup/util.py
new file mode 100644
index 0000000..693a7e2
--- /dev/null
+++ b/src/beautifulsoup/util.py
@@ -0,0 +1,29 @@
+# Helper functions and mixin classes for Beautiful Soup
+
+import types
+try:
+    set
+except NameError:
+    from sets import Set as set
+
+def isList(l):
+    """Convenience method that works with all 2.x versions of Python
+    to determine whether or not something is listlike."""
+    return ((hasattr(l, '__iter__') and not isString(l))
+            or (type(l) in (types.ListType, types.TupleType)))
+
+def isString(s):
+    """Convenience method that works with all 2.x versions of Python
+    to determine whether or not something is stringlike."""
+    try:
+        return isinstance(s, unicode) or isinstance(s, basestring)
+    except NameError:
+        return isinstance(s, str)
+
+def buildSet(args=None):
+    """Turns a list or a string into a set."""
+    if isinstance(args, str):
+        return set([args])
+    if args is None:
+        return set()
+    return set(args)
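
A note on the navigation API in element.py above: the whole find*/findAll*
family funnels through _findOne/_findAll plus a direction generator
(nextGenerator, parentGenerator, and so on). A minimal usage sketch, assuming
the package is importable as "beautifulsoup" as laid out in this tree; the
markup and variable names are invented for illustration:

    from beautifulsoup import BeautifulSoup

    soup = BeautifulSoup('<div><p id="a">One</p><p id="b">Two</p></div>')
    first = soup.find('p', {'id': 'a'})
    # findAllNext follows the .next chain (document order), not just siblings:
    print first.findAllNext('p')            # [<p id="b">Two</p>]
    # findParent follows the .parent chain:
    print first.findParent('div').name      # div
    # Calling a tag like a function delegates to findAll():
    print len(soup('p'))                    # 2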
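
The tree-surgery methods (extract, insert, append, replaceWith) maintain four
kinds of pointers per element: .parent, the sibling links, and the
.next/.previous parse-order chain. A small sketch of that bookkeeping, under
the same import assumption:

    from beautifulsoup import BeautifulSoup

    soup = BeautifulSoup('<a><b>keep</b><c>drop</c></a>')
    removed = soup.c.extract()   # unlinks <c>, stitching .next/.previous around it
    print soup                   # <a><b>keep</b></a>
    print removed                # <c>drop</c>
    # append() is insert(len(contents), ...); plain strings are wrapped
    # in NavigableString automatically:
    soup.a.append('!')
    print soup                   # <a><b>keep</b>!</a>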
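
SoupStrainer encapsulates match criteria and is accepted both by the parser
(parseOnlyThese, as in the OnlyTheLonely tests) and by findAll(). A sketch;
the import path for SoupStrainer is assumed from the module layout shown in
this diff:

    import re
    from beautifulsoup import BeautifulSoup
    from beautifulsoup.element import SoupStrainer

    # Only <b> elements are turned into Tag objects at parse time:
    strainer = SoupStrainer('b')
    soup = BeautifulSoup('<a>x</a><b>y</b><b>z</b>', parseOnlyThese=strainer)
    print len(soup)                           # 2 (just the two <b> tags)
    # The same strainer works as a query:
    print len(soup.findAll(strainer))         # 2
    # Matchers may be strings, lists, regexps, or callables:
    print soup.findAll(text=re.compile('y'))  # [u'y']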
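
CData, Comment, Declaration, and ProcessingInstruction are all
NavigableStrings whose decodeGivenEventualEncoding() restores the original
wrapper on output, which is what the testCData/testComments assertions rely
on. A sketch under the same assumptions:

    from beautifulsoup import BeautifulSoup

    soup = BeautifulSoup('<root>foo<![CDATA[bar]]><!--baz--></root>')
    for child in soup.root:
        print child.__class__.__name__   # NavigableString, CData, Comment
    # The wrappers come back when the tree is rendered:
    print soup.root.decodeContents()     # foo<![CDATA[bar]]><!--baz-->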
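
Tag._convertEntities() applies whatever policy the builder carries
(convertHTMLEntities / convertXMLEntities), which is how the
testEntityConversions assertions get different outputs from the same input.
A sketch; the builder class and import paths are taken from this tree and its
tests, not guaranteed elsewhere:

    from beautifulsoup import BeautifulStoneSoup, Entities
    from beautifulsoup.builder import HTMLParserXMLTreeBuilder

    text = 'sacr&eacute; bleu'
    xml = HTMLParserXMLTreeBuilder(convertEntities=Entities.XML_ENTITIES)
    html = HTMLParserXMLTreeBuilder(convertEntities=Entities.HTML_ENTITIES)
    # XML mode only knows the five XML entities, so &eacute; passes through:
    print BeautifulStoneSoup(text, xml).decode()    # sacr&eacute; bleu
    # HTML mode converts it to the corresponding Unicode character:
    print BeautifulStoneSoup(text, html).decode()   # sacr\xe9 bleu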
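
Finally, Tag.decode() chooses quote delimiters per attribute value, falling
back to the nonstandard &squot; entity when a value mixes both quote
characters (see the TODO in the source and testQuotedAttributeValues). Sketch:

    from beautifulsoup import BeautifulSoup

    soup = BeautifulSoup('<foo attr="bar">a</foo>')
    soup.foo['attr'] = 'Bob\'s "Bar"'
    # Double quotes in the value force single-quote delimiters, and the
    # embedded single quote becomes &squot;:
    print soup.foo   # <foo attr='Bob&squot;s "Bar"'>a</foo>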