diff options
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- | beautifulsoup/__init__.py | 2 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 98 | ||||
-rw-r--r-- | beautifulsoup/element.py | 180 |
3 files changed, 194 insertions, 86 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index ddf51f9..e23c9d9 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -171,7 +171,7 @@ class BeautifulStoneSoup(Tag): else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.builder.smart_quotes_to, isHTML=isHTML) + isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py new file mode 100644 index 0000000..86de5ec --- /dev/null +++ b/beautifulsoup/builder/__init__.py @@ -0,0 +1,98 @@ +from beautifulsoup.element import Entities + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + ] + + +class TreeBuilder(Entities): + """Turn a document into a Beautiful Soup object tree.""" + + assume_html = False + + def __init__(self): + self.soup = None + + def isSelfClosingTag(self, name): + return name in self.self_closing_tags + + def reset(self): + pass + + def feed(self, markup): + raise NotImplementedError() + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty <head> tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in attrs.items()) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are self-closing tags. + """ + + assume_html = True + + preserve_whitespace_tags = set(['pre', 'textarea']) + self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 8749114..39e0e06 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -161,77 +161,88 @@ class PageElement: """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) - def findNext(self, name=None, attrs={}, text=None, **kwargs): + def find_next(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears after this Tag in the document.""" - return self._findOne(self.findAllNext, name, attrs, text, **kwargs) + return self._findOne(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 - def findAllNext(self, name=None, attrs={}, text=None, limit=None, + def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.nextGenerator, + return self._find_all(name, attrs, text, limit, self.next_elements, **kwargs) + findAllNext = find_all_next # BS3 - def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.""" - return self._findOne(self.findNextSiblings, name, attrs, text, + return self._findOne(self.find_next_siblings, name, attrs, text, **kwargs) + findNextSibling = find_next_sibling # BS3 - def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): + def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): """Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.nextSiblingGenerator, **kwargs) - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 - def findPrevious(self, name=None, attrs={}, text=None, **kwargs): + def find_previous(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears before this Tag in the document.""" - return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) + return self._findOne( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 - def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.previousGenerator, + return self._find_all(name, attrs, text, limit, self.previous_elements, **kwargs) - fetchPrevious = findAllPrevious # Compatibility with pre-3.x + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 - def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.""" - return self._findOne(self.findPreviousSiblings, name, attrs, text, + return self._findOne(self.find_previous_siblings, name, attrs, text, **kwargs) + findPreviousSibling = find_previous_sibling # BS3 - def findPreviousSiblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 - def findParent(self, name=None, attrs={}, **kwargs): + def find_parent(self, name=None, attrs={}, **kwargs): """Returns the closest parent of this Tag that matches the given criteria.""" # NOTE: We can't use _findOne because findParents takes a different # set of arguments. r = None - l = self.findParents(name, attrs, 1) + l = self.find_parents(name, attrs, 1) if l: r = l[0] return r + findParent = find_parent # BS3 - def findParents(self, name=None, attrs={}, limit=None, **kwargs): + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): """Returns the parents of this Tag that match the given criteria.""" - return self._findAll(name, attrs, None, limit, self.parentGenerator, + return self._find_all(name, attrs, None, limit, self.parents, **kwargs) - fetchParents = findParents # Compatibility with pre-3.x + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 #These methods do the real heavy lifting. @@ -242,7 +253,7 @@ class PageElement: r = l[0] return r - def _findAll(self, name, attrs, text, limit, generator, **kwargs): + def _find_all(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match." if isinstance(name, SoupStrainer): @@ -251,10 +262,9 @@ class PageElement: # Build a SoupStrainer strainer = SoupStrainer(name, attrs, text, **kwargs) results = ResultSet(strainer) - g = generator() while True: try: - i = g.next() + i = generator.next() except StopIteration: break if i: @@ -265,38 +275,60 @@ class PageElement: break return results - #These Generators can be used to navigate starting from both + #These generators can be used to navigate starting from both #NavigableStrings and Tags. - def nextGenerator(self): + @property + def next_elements(self): i = self while i: i = i.next yield i - def nextSiblingGenerator(self): + @property + def next_siblings(self): i = self while i: i = i.nextSibling yield i - def previousGenerator(self): + @property + def previous_elements(self): i = self while i: i = i.previous yield i - def previousSiblingGenerator(self): + @property + def previous_siblings(self): i = self while i: i = i.previousSibling yield i - def parentGenerator(self): + @property + def parents(self): i = self while i: i = i.parent yield i + # Old non-property versions of the generators, for backwards + # compatibility with BS3. + def nextGenerator(self): + return self.next_elements + + def nextSiblingGenerator(self): + return self.next_siblings + + def previousGenerator(self): + return self.previous_elements + + def previousSiblingGenerator(self): + return self.previous_siblings + + def parentGenerator(self): + return self.parents + # Utility methods def substituteEncoding(self, str, encoding=None): encoding = encoding or "utf-8" @@ -389,37 +421,12 @@ class Tag(PageElement, Entities): """Represents a found HTML tag with its attributes and contents.""" - def _convertEntities(self, builder, match): - """Used in a call to re.sub to replace HTML, XML, and numeric - entities with the appropriate Unicode characters. If HTML - entities are being converted, any unrecognized entities are - escaped.""" - x = match.group(1) - if builder.convert_html_entities and x in name2codepoint: - return unichr(name2codepoint[x]) - elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: - if builder.convert_xml_entities: - return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] - else: - return u'&%s;' % x - elif len(x) > 0 and x[0] == '#': - # Handle numeric entities - if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) - else: - return unichr(int(x[1:])) - - elif self.escapeUnrecognizedEntities: - return u'&%s;' % x - else: - return u'&%s;' % x - def __init__(self, parser, builder, name, attrs=None, parent=None, previous=None): "Basic constructor." # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected + # chunks be garbage-collected. self.parserClass = parser.__class__ self.name = name self.isSelfClosing = builder.isSelfClosingTag(name) @@ -432,19 +439,11 @@ class Tag(PageElement, Entities): self.setup(parent, previous) self.hidden = False self.containsSubstitutions = False - self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities - - # Convert any HTML, XML, or numeric entities in the attribute values. - convert_one = lambda x: self._convertEntities(parser.builder, x) - def convert(kval): - k, val = kval - if val is None: - return kval - return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", convert_one, val)) + if isinstance(attrs, types.DictType): - self.attrs = [convert(kv) for kv in attrs.items()] + self.attrs = [kv for kv in attrs.items()] else: - self.attrs = map(convert, attrs) + self.attrs = list(attrs) @property def string(self): @@ -519,9 +518,9 @@ class Tag(PageElement, Entities): def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its - findAll() method. Eg. tag('a') returns a list of all the A tags + find_all() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" - return apply(self.findAll, args, kwargs) + return apply(self.find_all, args, kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) @@ -702,14 +701,14 @@ class Tag(PageElement, Entities): """Return only the first child of this Tag matching the given criteria.""" r = None - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) + l = self.find_all(name, attrs, recursive, text, 1, **kwargs) if l: r = l[0] return r findChild = find - def findAll(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): + def find_all(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): """Extracts a list of Tag objects that match the given criteria. You can specify the name of the Tag and any attributes you want the Tag to have. @@ -719,11 +718,12 @@ class Tag(PageElement, Entities): callable that takes a string and returns whether or not the string matches for some custom definition of 'matches'. The same is true of the tag name.""" - generator = self.recursiveChildGenerator + generator = self.recursive_children if not recursive: - generator = self.childGenerator - return self._findAll(name, attrs, text, limit, generator, **kwargs) - findChildren = findAll + generator = self.children + return self._find_all(name, attrs, text, limit, generator, **kwargs) + findAll = find_all # BS3 + findChildren = find_all # BS2 #Private methods @@ -737,12 +737,14 @@ class Tag(PageElement, Entities): return self.attrMap #Generator methods - def childGenerator(self): + @property + def children(self): for i in range(0, len(self.contents)): yield self.contents[i] raise StopIteration - def recursiveChildGenerator(self): + @property + def recursive_children(self): if not len(self.contents): raise StopIteration stopNode = self._lastRecursiveChild().next @@ -751,6 +753,14 @@ class Tag(PageElement, Entities): yield current current = current.next + # Old names for backwards compatibility + def childGenerator(self): + return self.children + + def recursiveChildGenerator(self): + return self.recursive_children + + # Next, a couple classes to represent queries and their results. class SoupStrainer: """Encapsulates a number of ways of matching a markup element (tag or |