summary | refs | log | tree | commit | diff
path: root/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder')
-rw-r--r-- bs4/builder/__init__.py 6
-rw-r--r-- bs4/builder/_html5lib.py 40
-rw-r--r-- bs4/builder/_htmlparser.py 3
-rw-r--r-- bs4/builder/_lxml.py 9
4 files changed, 48 insertions, 10 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index f8fce56..601979b 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -1,9 +1,13 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ HTMLAwareEntitySubstitution,
whitespace_re
)
@@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- preserve_whitespace_tags = set(['pre', 'textarea'])
+ preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 8725a65..c46f882 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -1,8 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
__all__ = [
'HTML5TreeBuilder',
]
-from pdb import set_trace
import warnings
from bs4.builder import (
PERMISSIVE,
@@ -23,6 +25,15 @@ from bs4.element import (
Tag,
)
+try:
+ # Pre-0.99999999
+ from html5lib.treebuilders import _base as treebuilder_base
+ new_html5lib = False
+except ImportError, e:
+ # 0.99999999 and up
+ from html5lib.treebuilders import base as treebuilder_base
+ new_html5lib = True
+
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
@@ -47,7 +58,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
- doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+ extra_kwargs = dict()
+ if not isinstance(markup, unicode):
+ if new_html5lib:
+ extra_kwargs['override_encoding'] = self.user_specified_encoding
+ else:
+ extra_kwargs['encoding'] = self.user_specified_encoding
+ doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
@@ -55,7 +73,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
- doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+ original_encoding = parser.tokenizer.stream.charEncoding[0]
+ if not isinstance(original_encoding, basestring):
+ # In 0.99999999 and up, the encoding is an html5lib
+ # Encoding object. We want to use a string for compatibility
+ # with other tree builders.
+ original_encoding = original_encoding.name
+ doc.original_encoding = original_encoding
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
@@ -67,7 +91,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
return u'<html><head></head><body>%s</body></html>' % fragment
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
@@ -105,7 +129,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
return self.soup
def getFragment(self):
- return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+ return treebuilder_base.TreeBuilder.getFragment(self).element
class AttrList(object):
def __init__(self, element):
@@ -137,9 +161,9 @@ class AttrList(object):
return name in list(self.attrs.keys())
-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
def __init__(self, element, soup, namespace):
- html5lib.treebuilders._base.Node.__init__(self, element.name)
+ treebuilder_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
@@ -324,7 +348,7 @@ class Element(html5lib.treebuilders._base.Node):
class TextNode(Element):
def __init__(self, element, soup):
- html5lib.treebuilders._base.Node.__init__(self, None)
+ treebuilder_base.Node.__init__(self, None)
self.element = element
self.soup = soup
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 0101d64..823ca15 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -1,5 +1,8 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
__all__ = [
'HTMLParserTreeBuilder',
]
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 9e8f88f..4495bb9 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -1,3 +1,5 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
@@ -12,6 +14,7 @@ from bs4.element import (
Doctype,
NamespacedAttribute,
ProcessingInstruction,
+ XMLProcessingInstruction,
)
from bs4.builder import (
FAST,
@@ -103,6 +106,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn.
is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
@@ -201,7 +208,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def pi(self, target, data):
self.soup.endData()
self.soup.handle_data(target + ' ' + data)
- self.soup.endData(ProcessingInstruction)
+ self.soup.endData(self.processing_instruction_class)
def data(self, content):
self.soup.handle_data(content)