Fixed a problem where the html.parser tree builder interpreted a string like '&foo ' as the character entity '&foo;' [bug=1728706]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-21 12:18:17 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-21 12:18:17 -0400
commit: febd6cea8cdd665a6f13c7040c3ff7d60d350e41 (patch)
tree: a2bb8f95277df3b41b2ba93f7a834b132079e842 /bs4/builder/_htmlparser.py
parent: 5cc46e31a63a4c0f2ae0109bc0216e91c4241766 (diff)
1 files changed, 6 insertions, 1 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ef9fd1e..eff30ff 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -152,7 +152,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         if character is not None:
             data = character
         else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was a character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
         self.handle_data(data)
 
     def handle_comment(self, data):
author	Leonard Richardson <leonardr@segfault.org>	2018-07-21 12:18:17 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-21 12:18:17 -0400
commit	febd6cea8cdd665a6f13c7040c3ff7d60d350e41 (patch)
tree	a2bb8f95277df3b41b2ba93f7a834b132079e842 /bs4/builder/_htmlparser.py
parent	5cc46e31a63a4c0f2ae0109bc0216e91c4241766 (diff)