diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-01 14:28:47 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-01 14:28:47 -0500 |
commit | 3a5c0800bcffbcefc61ee17d9ba2d93b4c51f141 (patch) | |
tree | e46e0e474b65358434fb67a9c0255b384cf0f772 | |
parent | 234e8eec133adc886161dbfa7cccc56fe59ed127 (diff) |
LXML's HTML parser is pretty permissive.
-rw-r--r-- | README.txt | 104 | ||||
-rw-r--r-- | TODO | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 2 |
3 files changed, 4 insertions, 105 deletions
@@ -72,110 +72,6 @@ Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3 support to the finish line. Ezio Melotti is also to thank for greatly improving the HTML parser that comes with Python 3.2. -== Better method names == - -Methods and attributes have been renamed to comply with PEP 8. The old names -still work. Here are the renames: - - * replaceWith -> replace_with - * replaceWithChildren -> replace_with_children - * findAll -> find_all - * findAllNext -> find_all_next - * findAllPrevious -> find_all_previous - * findNext -> find_next - * findNextSibling -> find_next_sibling - * findNextSiblings -> find_next_siblings - * findParent -> find_parent - * findParents -> find_parents - * findPrevious -> find_previous - * findPreviousSibling -> find_previous_sibling - * findPreviousSiblings -> find_previous_siblings - * nextSibling -> next_sibling - * previousSibling -> previous_sibling - -Methods have been renamed for compatibility with Python 3. - - * Tag.has_key() -> Tag.has_attr() - - (This was misleading, anyway, because has_key() looked at - a tag's attributes and __in__ looked at a tag's contents.) - -Some attributes have also been renamed, mostly to avoid using words -that have meaning to Python, like "unicode" and "next": - - * Tag.isSelfClosing -> Tag.is_empty_element (backwards compatible) - * UnicodeDammit.unicode -> UnicodeDammit.unicode_markup - (not backwards compatible) - * Tag.next -> Tag.next_element (not backwards compatible) - * Tag.previous -> Tag.previous_element (not backwards compatible) - -So have some arguments to the Beautiful Soup constructor: - - * BeautifulSoup(parseOnlyThese=...) -> BeautifulSoup(parse_only=...) - * BeautifulSoup(fromEncoding=...) -> BeautifulSoup(from_encoding=...) - - You can use the old names, but you'll get a DeprecationError. - -== Generators are now properties == - -The generators have been given more sensible (and PEP 8-compliant) -names, and turned into properties: - - * childGenerator() -> children - * nextGenerator() -> next_elements - * nextSiblingGenerator() -> next_siblings - * previousGenerator() -> previous_elements - * previousSiblingGenerator() -> previous_siblings - * recursiveChildGenerator() -> descendants - * parentGenerator() -> parents - -So instead of this: - - for parent in tag.parentGenerator(): - ... - -You can write this: - - for parent in tag.parents: - ... - -(But the old code will still work.) - -Some of the generators used to yield None after they were done, and -then stop. That was a bug. Now, the generators just stop. - -There are two new generators, .strings and .stripped_strings. .strings -yields NavigableString objects, and .stripped_strings yields Python -strings that have had whitespace stripped. - -== tag.string is recursive == - -tag.string now operates recursively. If tag A contains a single tag B -and nothing else, then A.string is the same as B.string. So: - -<a><b>foo</b></a> - -The value of a.string used to be None, and now it's "foo". - -== Empty-element tags == - -Beautiful Soup's handling of empty-element tags (aka self-closing -tags) has been improved, especially when parsing XML. Previously you -had to explicitly specify a list of empty-element tags when parsing -XML. You can still do that, but if you don't, Beautiful Soup now -considers any empty tag to be an empty-element tag. - -The determination of empty-element-ness is now made at runtime rather -than parse time. If you add a child to an empty-element tag, it stops -being an empty-element tag. - -== Entities are always converted to Unicode == - -An HTML or XML entity is always converted into the corresponding -Unicode character. There are no longer any smartQuotesTo or -convertEntities arguments. (Unicode, Dammit still has smart_quotes_to, -but its default is now to turn smart quotes into Unicode.) - == CDATA sections are normal text, if they're understood at all. == Currently, the lxml and html5lib HTML parsers ignore CDATA sections in @@ -6,6 +6,9 @@ Bugs * Characters like & < > should always be converted to HTML entities on output, even if substitute_html_entities is False. +* html5lib doesn't support SoupStrainers, which is OK, but there + should be a warning about it. + Big features ------------ diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 07b2032..b330979 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -96,7 +96,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - features = [LXML, HTML, FAST] + features = [LXML, HTML, FAST, PERMISSIVE] is_xml = False @property |