summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonard.richardson@canonical.com>2009-04-08 16:30:23 -0400
committerLeonard Richardson <leonard.richardson@canonical.com>2009-04-08 16:30:23 -0400
commit6d7cfb69c5c11cb98b10a2de0900b0f9b6ada6b8 (patch)
treed43d98672a964b66c0620d773185ee6bca692d74
parentcafaed172c6fd8b0668f1365bbb97331ca4fa1dc (diff)
Added an LXML builder and a stupid test.
-rw-r--r--BeautifulSoup.py50
-rw-r--r--lxml_builder.py36
-rw-r--r--lxml_test.py13
3 files changed, 81 insertions, 18 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index 67485d1..1fb9a1b 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -104,7 +104,7 @@ DEFAULT_OUTPUT_ENCODING = "utf-8"
# First, the classes that represent markup elements.
-class Entities:
+class KnowsEntitiesMixin:
"""Knows about XML entities."""
HTML_ENTITIES = "html"
@@ -469,7 +469,7 @@ class Declaration(NavigableString):
def decodeGivenEventualEncoding(self, eventualEncoding):
return u'<!' + self + u'>'
-class Tag(PageElement, Entities):
+class Tag(PageElement, KnowsEntitiesMixin):
"""Represents a found HTML tag with its attributes and contents."""
@@ -1004,7 +1004,24 @@ def buildTagMap(default, *args):
# Now, the parser classes.
-class XMLParserBuilder(HTMLParser, Entities):
+class TreeBuilder(KnowsEntitiesMixin):
+    """Base class for the objects that turn markup into parse events.
+
+    It supplies empty defaults for the tag sets and no-op lifecycle
+    hooks; subclasses override whatever they need.
+    """
+
+    # Smart quotes are converted to XML entities unless overridden.
+    smartQuotesTo = KnowsEntitiesMixin.XML_ENTITIES
+
+    # Tag-name sets. All are empty in the base class.
+    self_closing_tags = set()
+    QUOTE_TAGS = set()
+    PRESERVE_WHITESPACE_TAGS = set()
+
+    def isSelfClosingTag(self, name):
+        """Return True if `name` is in this builder's self_closing_tags."""
+        return name in self.self_closing_tags
+
+    def reset(self):
+        """Do nothing; a hook for subclasses that keep parser state."""
+
+    def close(self):
+        """Do nothing; a hook for subclasses that must finish parsing."""
+
+
+class XMLParserBuilder(TreeBuilder, HTMLParser):
"""
HTMLParser will process most bad HTML, and the BeautifulSoup
@@ -1035,13 +1052,9 @@ class XMLParserBuilder(HTMLParser, Entities):
lambda x: '<!' + x.group(1) + '>')
]
- PRESERVE_WHITESPACE_TAGS = set()
- QUOTE_TAGS = set()
- SELF_CLOSING_TAGS = set()
-
def __init__(self, convertEntities=None, markupMassage=True,
selfClosingTags=None,
- smartQuotesTo=Entities.XML_ENTITIES):
+ smartQuotesTo=KnowsEntitiesMixin.XML_ENTITIES):
HTMLParser.__init__(self)
self.soup = None
self.convertEntities = convertEntities
@@ -1093,7 +1106,7 @@ class XMLParserBuilder(HTMLParser, Entities):
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
- return (name in self.SELF_CLOSING_TAGS
+ return (name in self.self_closing_tags
or name in self.instanceSelfClosingTags)
def handle_starttag(self, name, attrs):
@@ -1105,13 +1118,6 @@ class XMLParserBuilder(HTMLParser, Entities):
def handle_data(self, content):
self.soup.handle_data(content)
- def _toStringSubclass(self, text, subclass):
- """Adds a certain piece of text to the tree as a NavigableString
- subclass."""
- self.soup.endData()
- self.handle_data(text)
- self.soup.endData(subclass)
-
def handle_pi(self, text):
"""Handle a processing instruction as a ProcessingInstruction
object, possibly one with a %SOUP-ENCODING% slot into which an
@@ -1179,6 +1185,13 @@ class XMLParserBuilder(HTMLParser, Entities):
"Handle DOCTYPEs and the like as Declaration objects."
self._toStringSubclass(data, Declaration)
+ def _toStringSubclass(self, text, subclass):
+ """Add a piece of text to the tree as an instance of the given
+ NavigableString subclass.
+
+ text: the string to add.
+ subclass: the class handed to self.soup.endData() after the text
+ has been passed to handle_data().
+ """
+ # NOTE(review): presumably this first endData() flushes any text
+ # already pending so it is not merged with `text` -- confirm
+ # against the soup's endData() implementation.
+ self.soup.endData()
+ self.handle_data(text)
+ self.soup.endData(subclass)
+
def parse_declaration(self, i):
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
declaration as a CData object."""
@@ -1204,7 +1217,7 @@ class HTMLParserBuilder(XMLParserBuilder):
PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
QUOTE_TAGS = set(['script', 'textarea'])
- SELF_CLOSING_TAGS = set(['br' , 'hr', 'input', 'img', 'meta',
+ self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
def __init__(self, *args, **kwargs):
@@ -1231,7 +1244,7 @@ class BeautifulStoneSoup(Tag):
"<foo><bar></bar></foo>".
[Another possible explanation is "<foo><bar /></foo>", but since
- this class defines no SELF_CLOSING_TAGS, it will never use that
+ this class defines no self_closing_tags, it will never use that
explanation.]
This class is useful for parsing XML or made-up markup languages,
@@ -1276,6 +1289,7 @@ class BeautifulStoneSoup(Tag):
except StopParsing:
pass
self.markup = None # The markup can now be GCed.
+ self.builder.close()
self.builder.soup = None
self.builder = None # So can the builder.
diff --git a/lxml_builder.py b/lxml_builder.py
new file mode 100644
index 0000000..95988e4
--- /dev/null
+++ b/lxml_builder.py
@@ -0,0 +1,36 @@
+from lxml import etree
+from BeautifulSoup import TreeBuilder
+
+class LXMLBuilder(TreeBuilder):
+    """A tree builder that drives an lxml parser.
+
+    The builder passes itself to the parser as its ``target``, so the
+    parser reports events through start(), end(), data() and comment().
+    Each of those forwards the event to the object in ``self.soup``.
+    """
+
+    def __init__(self, parser_class=etree.XMLParser, self_closing_tags=None):
+        """Create the underlying parser.
+
+        parser_class: called as parser_class(target=self) to build the
+            parser. Defaults to etree.XMLParser.
+        self_closing_tags: collection of tag names to treat as
+            self-closing. Defaults to a new empty set per builder.
+        """
+        # A mutable default argument would be shared by every builder
+        # created without an explicit value, so build a fresh
+        # collection for each instance instead.
+        if self_closing_tags is None:
+            self_closing_tags = set()
+        self.parser = parser_class(target=self)
+        self.self_closing_tags = self_closing_tags
+        # The event handlers below call methods on self.soup, so it must
+        # be assigned before any markup is fed.
+        self.soup = None
+
+    def isSelfClosingTag(self, name):
+        """Return True if `name` is one of this builder's
+        self-closing tags."""
+        return name in self.self_closing_tags
+
+    def feed(self, markup):
+        """Feed all of `markup` to the parser, then close the parser."""
+        self.parser.feed(markup)
+        self.parser.close()
+
+    def start(self, name, attrs):
+        """Parser target callback: forward a start tag to the soup."""
+        self.soup.handle_starttag(name, attrs)
+
+    def end(self, name):
+        """Parser target callback: forward an end tag to the soup."""
+        self.soup.handle_endtag(name)
+
+    def data(self, content):
+        """Parser target callback: forward text content to the soup."""
+        self.soup.handle_data(content)
+
+    def comment(self, content):
+        """Handle comments as Comment objects."""
+        # Comment is not among this module's top-level imports, so it is
+        # imported here; without it this method raises NameError.
+        from BeautifulSoup import Comment
+        self._toStringSubclass(content, Comment)
+
+    def _toStringSubclass(self, text, subclass):
+        """Add a piece of text to the tree as an instance of the given
+        NavigableString subclass."""
+        self.soup.endData()
+        self.data(text)
+        self.soup.endData(subclass)
diff --git a/lxml_test.py b/lxml_test.py
new file mode 100644
index 0000000..99375dd
--- /dev/null
+++ b/lxml_test.py
@@ -0,0 +1,13 @@
+from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+from lxml_builder import LXMLBuilder
+from lxml import etree
+# Smoke test: parse small documents through LXMLBuilder and print the
+# prettified trees.
+xml_markup = "<foo>bar</foo>"
+xml_builder = LXMLBuilder()
+# The same builder instance is used for both soup classes.
+for soup_class in (BeautifulStoneSoup, BeautifulSoup):
+    print soup_class(xml_markup, builder=xml_builder).prettify()
+
+# Markup with unclosed tags, parsed with lxml's HTML parser and with
+# "br" registered as a self-closing tag.
+html_builder = LXMLBuilder(parser_class=etree.HTMLParser, self_closing_tags=["br"])
+html_markup = "<html><head><title>test<body><h1>page<br />title</h1>"
+print BeautifulSoup(html_markup, builder=html_builder).prettify()