summaryrefslogtreecommitdiff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r--bs4/builder/_htmlparser.py7
-rw-r--r--bs4/testing.py8
2 files changed, 14 insertions, 1 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ef9fd1e..eff30ff 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -152,7 +152,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
- data = "&%s;" % name
+ # If this were XML, it would be ambiguous whether "&foo"
+ # was an character entity reference with a missing
+ # semicolon or the literal string "&foo". Since this is
+ # HTML, we have a complete list of all character entity references,
+ # and this one wasn't found, so assume it's the literal string "&foo".
+ data = "&%s" % name
self.handle_data(data)
def handle_comment(self, data):
diff --git a/bs4/testing.py b/bs4/testing.py
index 5b0eb8f..bbcc271 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -319,6 +319,14 @@ Hello, world!
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
+ def test_strings_resembling_character_entity_references(self):
+ # "&T" and "&p" look like incomplete character entities, but they are
+ # not.
+ self.assertSoupEquals(
+ u"<p>&bull; AT&T is in the s&p 500</p>",
+ u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
+ )
+
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)