diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-21 12:18:17 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-21 12:18:17 -0400 |
commit | febd6cea8cdd665a6f13c7040c3ff7d60d350e41 (patch) | |
tree | a2bb8f95277df3b41b2ba93f7a834b132079e842 /bs4/builder/_htmlparser.py | |
parent | 5cc46e31a63a4c0f2ae0109bc0216e91c4241766 (diff) |
Fixed a problem where the html.parser tree builder interpreted
a string like '&foo ' as the character entity '&foo;' [bug=1728706]
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 7 |
1 files changed, 6 insertions, 1 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ef9fd1e..eff30ff 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -152,7 +152,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         if character is not None:
             data = character
         else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was an character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
         self.handle_data(data)
 
     def handle_comment(self, data):