diff options
-rw-r--r-- | BeautifulSoup.py | 50 | ||||
-rw-r--r-- | lxml_builder.py | 36 | ||||
-rw-r--r-- | lxml_test.py | 13 |
3 files changed, 81 insertions, 18 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py index 67485d1..1fb9a1b 100644 --- a/BeautifulSoup.py +++ b/BeautifulSoup.py @@ -104,7 +104,7 @@ DEFAULT_OUTPUT_ENCODING = "utf-8" # First, the classes that represent markup elements. -class Entities: +class KnowsEntitiesMixin: """Knows about XML entities.""" HTML_ENTITIES = "html" @@ -469,7 +469,7 @@ class Declaration(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!' + self + u'>' -class Tag(PageElement, Entities): +class Tag(PageElement, KnowsEntitiesMixin): """Represents a found HTML tag with its attributes and contents.""" @@ -1004,7 +1004,24 @@ def buildTagMap(default, *args): # Now, the parser classes. -class XMLParserBuilder(HTMLParser, Entities): +class TreeBuilder(KnowsEntitiesMixin): + + smartQuotesTo = KnowsEntitiesMixin.XML_ENTITIES + PRESERVE_WHITESPACE_TAGS = set() + QUOTE_TAGS = set() + self_closing_tags = set() + + def isSelfClosingTag(self, name): + return name in self.self_closing_tags + + def reset(self): + pass + + def close(self): + pass + + +class XMLParserBuilder(TreeBuilder, HTMLParser): """ HTMLParser will process most bad HTML, and the BeautifulSoup @@ -1035,13 +1052,9 @@ class XMLParserBuilder(HTMLParser, Entities): lambda x: '<!' + x.group(1) + '>') ] - PRESERVE_WHITESPACE_TAGS = set() - QUOTE_TAGS = set() - SELF_CLOSING_TAGS = set() - def __init__(self, convertEntities=None, markupMassage=True, selfClosingTags=None, - smartQuotesTo=Entities.XML_ENTITIES): + smartQuotesTo=KnowsEntitiesMixin.XML_ENTITIES): HTMLParser.__init__(self) self.soup = None self.convertEntities = convertEntities @@ -1093,7 +1106,7 @@ class XMLParserBuilder(HTMLParser, Entities): def isSelfClosingTag(self, name): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" - return (name in self.SELF_CLOSING_TAGS + return (name in self.self_closing_tags or name in self.instanceSelfClosingTags) def handle_starttag(self, name, attrs): @@ -1105,13 +1118,6 @@ class XMLParserBuilder(HTMLParser, Entities): def handle_data(self, content): self.soup.handle_data(content) - def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.soup.endData() - self.handle_data(text) - self.soup.endData(subclass) - def handle_pi(self, text): """Handle a processing instruction as a ProcessingInstruction object, possibly one with a %SOUP-ENCODING% slot into which an @@ -1179,6 +1185,13 @@ class XMLParserBuilder(HTMLParser, Entities): "Handle DOCTYPEs and the like as Declaration objects." self._toStringSubclass(data, Declaration) + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.soup.endData() + self.handle_data(text) + self.soup.endData(subclass) + def parse_declaration(self, i): """Treat a bogus SGML declaration as raw data. Treat a CDATA declaration as a CData object.""" @@ -1204,7 +1217,7 @@ class HTMLParserBuilder(XMLParserBuilder): PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) QUOTE_TAGS = set(['script', 'textarea']) - SELF_CLOSING_TAGS = set(['br' , 'hr', 'input', 'img', 'meta', + self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) def __init__(self, *args, **kwargs): @@ -1231,7 +1244,7 @@ class BeautifulStoneSoup(Tag): "<foo><bar></bar></foo>". [Another possible explanation is "<foo><bar /></foo>", but since - this class defines no SELF_CLOSING_TAGS, it will never use that + this class defines no self_closing_tags, it will never use that explanation.] This class is useful for parsing XML or made-up markup languages, @@ -1276,6 +1289,7 @@ class BeautifulStoneSoup(Tag): except StopParsing: pass self.markup = None # The markup can now be GCed. + self.builder.close() self.builder.soup = None self.builder = None # So can the builder. diff --git a/lxml_builder.py b/lxml_builder.py new file mode 100644 index 0000000..95988e4 --- /dev/null +++ b/lxml_builder.py @@ -0,0 +1,36 @@ +from lxml import etree +from BeautifulSoup import TreeBuilder + +class LXMLBuilder(TreeBuilder): + + def __init__(self, parser_class=etree.XMLParser, self_closing_tags=[]): + self.parser = parser_class(target=self) + self.self_closing_tags = self_closing_tags + self.soup = None + + def isSelfClosingTag(self, name): + return name in self.self_closing_tags + + def feed(self, markup): + self.parser.feed(markup) + self.parser.close() + + def start(self, name, attrs): + self.soup.handle_starttag(name, attrs) + + def end(self, name): + self.soup.handle_endtag(name) + + def data(self, content): + self.soup.handle_data(content) + + def comment(self, content): + "Handle comments as Comment objects." + self._toStringSubclass(content, Comment) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.soup.endData() + self.data(text) + self.soup.endData(subclass) diff --git a/lxml_test.py b/lxml_test.py new file mode 100644 index 0000000..99375dd --- /dev/null +++ b/lxml_test.py @@ -0,0 +1,13 @@ +from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup +from lxml_builder import LXMLBuilder +from lxml import etree +builder = LXMLBuilder() +soup = BeautifulStoneSoup("<foo>bar</foo>", builder=builder) +print soup.prettify() + +soup = BeautifulSoup("<foo>bar</foo>", builder=builder) +print soup.prettify() + +builder = LXMLBuilder(parser_class=etree.HTMLParser, self_closing_tags=["br"]) +soup = BeautifulSoup("<html><head><title>test<body><h1>page<br />title</h1>", builder=builder) +print soup.prettify() |