summary | refs | log | tree | commit | diff
path: root/beautifulsoup
diff options
context:
space:
mode:
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- beautifulsoup/__init__.py 2
-rw-r--r-- beautifulsoup/builder/__init__.py 98
-rw-r--r-- beautifulsoup/element.py 180
3 files changed, 194 insertions, 86 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index ddf51f9..e23c9d9 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -171,7 +171,7 @@ class BeautifulStoneSoup(Tag):
else:
dammit = UnicodeDammit\
(markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.builder.smart_quotes_to, isHTML=isHTML)
+ isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
new file mode 100644
index 0000000..86de5ec
--- /dev/null
+++ b/beautifulsoup/builder/__init__.py
@@ -0,0 +1,98 @@
+from beautifulsoup.element import Entities
+
+__all__ = [
+ 'HTMLTreeBuilder',
+ 'SAXTreeBuilder',
+ 'TreeBuilder',
+ ]
+
+
+class TreeBuilder(Entities):
+ """Turn a document into a Beautiful Soup object tree."""
+
+ assume_html = False
+
+ def __init__(self):
+ self.soup = None
+
+ def isSelfClosingTag(self, name):
+ return name in self.self_closing_tags
+
+ def reset(self):
+ pass
+
+ def feed(self, markup):
+ raise NotImplementedError()
+
+ def test_fragment_to_document(self, fragment):
+ """Wrap an HTML fragment to make it look like a document.
+
+ Different parsers do this differently. For instance, lxml
+ introduces an empty <head> tag, and html5lib
+ doesn't. Abstracting this away lets us write simple tests
+ which run HTML fragments through the parser and compare the
+ results against other HTML fragments.
+
+ This method should not be used outside of tests.
+ """
+ return fragment
+
+
+class SAXTreeBuilder(TreeBuilder):
+ """A Beautiful Soup treebuilder that listens for SAX events."""
+
+ def feed(self, markup):
+ raise NotImplementedError()
+
+ def close(self):
+ pass
+
+ def startElement(self, name, attrs):
+ attrs = dict((key[1], value) for key, value in attrs.items())
+ #print "Start %s, %r" % (name, attrs)
+ self.soup.handle_starttag(name, attrs)
+
+ def endElement(self, name):
+ #print "End %s" % name
+ self.soup.handle_endtag(name)
+
+ def startElementNS(self, nsTuple, nodeName, attrs):
+ # Throw away (ns, nodeName) for now.
+ self.startElement(nodeName, attrs)
+
+ def endElementNS(self, nsTuple, nodeName):
+ # Throw away (ns, nodeName) for now.
+ self.endElement(nodeName)
+ #handler.endElementNS((ns, node.nodeName), node.nodeName)
+
+ def startPrefixMapping(self, prefix, nodeValue):
+ # Ignore the prefix for now.
+ pass
+
+ def endPrefixMapping(self, prefix):
+ # Ignore the prefix for now.
+ # handler.endPrefixMapping(prefix)
+ pass
+
+ def characters(self, content):
+ self.soup.handle_data(content)
+
+ def startDocument(self):
+ pass
+
+ def endDocument(self):
+ pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+ """This TreeBuilder knows facts about HTML.
+
+ Such as which tags are self-closing tags.
+ """
+
+ assume_html = True
+
+ preserve_whitespace_tags = set(['pre', 'textarea'])
+ self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base'])
+
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 8749114..39e0e06 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -161,77 +161,88 @@ class PageElement:
"""Appends the given tag to the contents of this tag."""
self.insert(len(self.contents), tag)
- def findNext(self, name=None, attrs={}, text=None, **kwargs):
+ def find_next(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears after this Tag in the document."""
- return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
+ return self._findOne(self.find_all_next, name, attrs, text, **kwargs)
+ findNext = find_next # BS3
- def findAllNext(self, name=None, attrs={}, text=None, limit=None,
+ def find_all_next(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
"""Returns all items that match the given criteria and appear
after this Tag in the document."""
- return self._findAll(name, attrs, text, limit, self.nextGenerator,
+ return self._find_all(name, attrs, text, limit, self.next_elements,
**kwargs)
+ findAllNext = find_all_next # BS3
- def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
+ def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears after this Tag in the document."""
- return self._findOne(self.findNextSiblings, name, attrs, text,
+ return self._findOne(self.find_next_siblings, name, attrs, text,
**kwargs)
+ findNextSibling = find_next_sibling # BS3
- def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
- **kwargs):
+ def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
"""Returns the siblings of this Tag that match the given
criteria and appear after this Tag in the document."""
- return self._findAll(name, attrs, text, limit,
- self.nextSiblingGenerator, **kwargs)
- fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+ return self._find_all(name, attrs, text, limit,
+ self.next_siblings, **kwargs)
+ findNextSiblings = find_next_siblings # BS3
+ fetchNextSiblings = find_next_siblings # BS2
- def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
+ def find_previous(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears before this Tag in the document."""
- return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
+ return self._findOne(
+ self.find_all_previous, name, attrs, text, **kwargs)
+ findPrevious = find_previous # BS3
- def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
+ def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
"""Returns all items that match the given criteria and appear
before this Tag in the document."""
- return self._findAll(name, attrs, text, limit, self.previousGenerator,
+ return self._find_all(name, attrs, text, limit, self.previous_elements,
**kwargs)
- fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+ findAllPrevious = find_all_previous # BS3
+ fetchPrevious = find_all_previous # BS2
- def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
+ def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
given criteria and appears before this Tag in the document."""
- return self._findOne(self.findPreviousSiblings, name, attrs, text,
+ return self._findOne(self.find_previous_siblings, name, attrs, text,
**kwargs)
+ findPreviousSibling = find_previous_sibling # BS3
- def findPreviousSiblings(self, name=None, attrs={}, text=None,
- limit=None, **kwargs):
+ def find_previous_siblings(self, name=None, attrs={}, text=None,
+ limit=None, **kwargs):
"""Returns the siblings of this Tag that match the given
criteria and appear before this Tag in the document."""
- return self._findAll(name, attrs, text, limit,
- self.previousSiblingGenerator, **kwargs)
- fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+ return self._find_all(name, attrs, text, limit,
+ self.previous_siblings, **kwargs)
+ findPreviousSiblings = find_previous_siblings # BS3
+ fetchPreviousSiblings = find_previous_siblings # BS2
- def findParent(self, name=None, attrs={}, **kwargs):
+ def find_parent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
criteria."""
# NOTE: We can't use _findOne because findParents takes a different
# set of arguments.
r = None
- l = self.findParents(name, attrs, 1)
+ l = self.find_parents(name, attrs, 1)
if l:
r = l[0]
return r
+ findParent = find_parent # BS3
- def findParents(self, name=None, attrs={}, limit=None, **kwargs):
+ def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
"""Returns the parents of this Tag that match the given
criteria."""
- return self._findAll(name, attrs, None, limit, self.parentGenerator,
+ return self._find_all(name, attrs, None, limit, self.parents,
**kwargs)
- fetchParents = findParents # Compatibility with pre-3.x
+ findParents = find_parents # BS3
+ fetchParents = find_parents # BS2
#These methods do the real heavy lifting.
@@ -242,7 +253,7 @@ class PageElement:
r = l[0]
return r
- def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+ def _find_all(self, name, attrs, text, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
if isinstance(name, SoupStrainer):
@@ -251,10 +262,9 @@ class PageElement:
# Build a SoupStrainer
strainer = SoupStrainer(name, attrs, text, **kwargs)
results = ResultSet(strainer)
- g = generator()
while True:
try:
- i = g.next()
+ i = generator.next()
except StopIteration:
break
if i:
@@ -265,38 +275,60 @@ class PageElement:
break
return results
- #These Generators can be used to navigate starting from both
+ #These generators can be used to navigate starting from both
#NavigableStrings and Tags.
- def nextGenerator(self):
+ @property
+ def next_elements(self):
i = self
while i:
i = i.next
yield i
- def nextSiblingGenerator(self):
+ @property
+ def next_siblings(self):
i = self
while i:
i = i.nextSibling
yield i
- def previousGenerator(self):
+ @property
+ def previous_elements(self):
i = self
while i:
i = i.previous
yield i
- def previousSiblingGenerator(self):
+ @property
+ def previous_siblings(self):
i = self
while i:
i = i.previousSibling
yield i
- def parentGenerator(self):
+ @property
+ def parents(self):
i = self
while i:
i = i.parent
yield i
+ # Old non-property versions of the generators, for backwards
+ # compatibility with BS3.
+ def nextGenerator(self):
+ return self.next_elements
+
+ def nextSiblingGenerator(self):
+ return self.next_siblings
+
+ def previousGenerator(self):
+ return self.previous_elements
+
+ def previousSiblingGenerator(self):
+ return self.previous_siblings
+
+ def parentGenerator(self):
+ return self.parents
+
# Utility methods
def substituteEncoding(self, str, encoding=None):
encoding = encoding or "utf-8"
@@ -389,37 +421,12 @@ class Tag(PageElement, Entities):
"""Represents a found HTML tag with its attributes and contents."""
- def _convertEntities(self, builder, match):
- """Used in a call to re.sub to replace HTML, XML, and numeric
- entities with the appropriate Unicode characters. If HTML
- entities are being converted, any unrecognized entities are
- escaped."""
- x = match.group(1)
- if builder.convert_html_entities and x in name2codepoint:
- return unichr(name2codepoint[x])
- elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
- if builder.convert_xml_entities:
- return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
- else:
- return u'&%s;' % x
- elif len(x) > 0 and x[0] == '#':
- # Handle numeric entities
- if len(x) > 1 and x[1] == 'x':
- return unichr(int(x[2:], 16))
- else:
- return unichr(int(x[1:]))
-
- elif self.escapeUnrecognizedEntities:
- return u'&amp;%s;' % x
- else:
- return u'&%s;' % x
-
def __init__(self, parser, builder, name, attrs=None, parent=None,
previous=None):
"Basic constructor."
# We don't actually store the parser object: that lets extracted
- # chunks be garbage-collected
+ # chunks be garbage-collected.
self.parserClass = parser.__class__
self.name = name
self.isSelfClosing = builder.isSelfClosingTag(name)
@@ -432,19 +439,11 @@ class Tag(PageElement, Entities):
self.setup(parent, previous)
self.hidden = False
self.containsSubstitutions = False
- self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
-
- # Convert any HTML, XML, or numeric entities in the attribute values.
- convert_one = lambda x: self._convertEntities(parser.builder, x)
- def convert(kval):
- k, val = kval
- if val is None:
- return kval
- return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", convert_one, val))
+
if isinstance(attrs, types.DictType):
- self.attrs = [convert(kv) for kv in attrs.items()]
+ self.attrs = [kv for kv in attrs.items()]
else:
- self.attrs = map(convert, attrs)
+ self.attrs = list(attrs)
@property
def string(self):
@@ -519,9 +518,9 @@ class Tag(PageElement, Entities):
def __call__(self, *args, **kwargs):
"""Calling a tag like a function is the same as calling its
- findAll() method. Eg. tag('a') returns a list of all the A tags
+ find_all() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
- return apply(self.findAll, args, kwargs)
+ return apply(self.find_all, args, kwargs)
def __getattr__(self, tag):
#print "Getattr %s.%s" % (self.__class__, tag)
@@ -702,14 +701,14 @@ class Tag(PageElement, Entities):
"""Return only the first child of this Tag matching the given
criteria."""
r = None
- l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+ l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
if l:
r = l[0]
return r
findChild = find
- def findAll(self, name=None, attrs={}, recursive=True, text=None,
- limit=None, **kwargs):
+ def find_all(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None, **kwargs):
"""Extracts a list of Tag objects that match the given
criteria. You can specify the name of the Tag and any
attributes you want the Tag to have.
@@ -719,11 +718,12 @@ class Tag(PageElement, Entities):
callable that takes a string and returns whether or not the
string matches for some custom definition of 'matches'. The
same is true of the tag name."""
- generator = self.recursiveChildGenerator
+ generator = self.recursive_children
if not recursive:
- generator = self.childGenerator
- return self._findAll(name, attrs, text, limit, generator, **kwargs)
- findChildren = findAll
+ generator = self.children
+ return self._find_all(name, attrs, text, limit, generator, **kwargs)
+ findAll = find_all # BS3
+ findChildren = find_all # BS2
#Private methods
@@ -737,12 +737,14 @@ class Tag(PageElement, Entities):
return self.attrMap
#Generator methods
- def childGenerator(self):
+ @property
+ def children(self):
for i in range(0, len(self.contents)):
yield self.contents[i]
raise StopIteration
- def recursiveChildGenerator(self):
+ @property
+ def recursive_children(self):
if not len(self.contents):
raise StopIteration
stopNode = self._lastRecursiveChild().next
@@ -751,6 +753,14 @@ class Tag(PageElement, Entities):
yield current
current = current.next
+ # Old names for backwards compatibility
+ def childGenerator(self):
+ return self.children
+
+ def recursiveChildGenerator(self):
+ return self.recursive_children
+
+
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
"""Encapsulates a number of ways of matching a markup element (tag or