summary | refs | log | tree | commit | diff
path: root/TODO
diff options
context:
space:
mode:
Diffstat (limited to 'TODO')
-rw-r--r-- TODO 45
1 files changed, 45 insertions, 0 deletions
diff --git a/TODO b/TODO
index 75f1ca6..208638d 100644
--- a/TODO
+++ b/TODO
@@ -6,3 +6,48 @@ Calculate tag.string dynamically rather than when creating the
tree. The html5lib builder doesn't use popTag, and adding/removing
things from the tree after the fact may also change the
value/availability of tag.string.
+
+---
+
+Here are some unit tests that fail with HTMLParser.
+
+ def testValidButBogusDeclarationFAILS(self):
+ self.assertSoupEquals('<! Foo >a', '<!Foo >a')
+
+ def testIncompleteDeclarationAtEndFAILS(self):
+ self.assertSoupEquals('a<!b')
+
+ def testIncompleteEntityAtEndFAILS(self):
+ self.assertSoupEquals('&lt;Hello&gt')
+
+ # This is not what the original author had in mind, but it's
+ # a legitimate interpretation of what they wrote.
+ self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
+ '<a href="foo&lt;/a&gt;, &lt;/a&gt;&lt;a href="></a>, <a href="bar">baz</a>')
+ # SGMLParser generates bogus parse events when attribute values
+ # contain embedded brackets, but at least Beautiful Soup fixes
+ # it up a little.
+ self.assertSoupEquals('<a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a>')
+ self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
+ """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
+
+ invalidEntity = "foo&#bar;baz"
+ soup = BeautifulStoneSoup\
+ (invalidEntity,
+ convertEntities=htmlEnt)
+ self.assertEquals(str(soup), invalidEntity)
+
+
+Tag names that contain Unicode characters crash the parser:
+ def testUnicodeTagNamesFAILS(self):
+ self.assertSoupEquals("<デダ芻デダtext>2PM</デダ芻デダtext>")
+
+Here's the implementation of NavigableString.__unicode__:
+
+ def __unicode__(self):
+ return unicode(str(self))
+
+It converts the Unicode object to a byte string, and then back to
+Unicode. I can't find any other way of turning an instance of a
+Unicode subclass into a plain Unicode object. This round trip is
+pretty bad, and a better technique is welcome.