summary | refs | log | tree | commit | diff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2019-08-21 19:06:09 -0400
committer Leonard Richardson <leonardr@segfault.org> 2019-08-21 19:06:09 -0400
commit 694a432f6ed1a62de742c7dfda0ed39ca4f2bb63 (patch)
tree 57a73c5a8ec210a09046ab77c6687b300590ab8b /bs4/__init__.py
parent db31fa66e82d097c2378734950bd3066acdae397 (diff)
When instantiating a BeautifulSoup object, it's now possible to
provide replacement classes to be instantiated for every tag ('tag_class') or string ('string_class') encountered during parsing, rather than using the default Tag and NavigableString objects.
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- bs4/__init__.py 29
1 files changed, 19 insertions, 10 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e6efb38..4ec4785 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -63,7 +63,7 @@ class BeautifulSoup(Tag):
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
- endData(containerClass=NavigableString) # Ends the current data node
+ endData(containerClass) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
@@ -78,14 +78,14 @@ class BeautifulSoup(Tag):
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
+
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
- **kwargs):
+ tag_class=Tag, string_class=NavigableString, **kwargs):
"""Constructor.
:param markup: A string or a file-like object representing
@@ -185,6 +185,9 @@ class BeautifulSoup(Tag):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
+ self.tag_class = tag_class
+ self.string_class = string_class
+
# We need this information to track whether or not the builder
# was specified well enough that we can omit the 'you need to
# specify a parser' warning.
@@ -381,11 +384,14 @@ class BeautifulSoup(Tag):
sourceline=None, sourcepos=None, **kwattrs):
"""Create a new tag associated with this soup."""
kwattrs.update(attrs)
- return Tag(None, self.builder, name, namespace, nsprefix, kwattrs,
- sourceline=sourceline, sourcepos=sourcepos)
+ return self.tag_class(
+ None, self.builder, name, namespace, nsprefix, kwattrs,
+ sourceline=sourceline, sourcepos=sourcepos
+ )
- def new_string(self, s, subclass=NavigableString):
+ def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this soup."""
+ subclass = subclass or self.string_class
return subclass(s)
def insert_before(self, successor):
@@ -412,7 +418,8 @@ class BeautifulSoup(Tag):
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
- def endData(self, containerClass=NavigableString):
+ def endData(self, containerClass=None):
+ containerClass = containerClass or self.string_class
if self.current_data:
current_data = u''.join(self.current_data)
# If whitespace is not preserved, and this string contains
@@ -551,9 +558,11 @@ class BeautifulSoup(Tag):
or not self.parse_only.search_tag(name, attrs))):
return None
- tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
- self.currentTag, self._most_recent_element,
- sourceline=sourceline, sourcepos=sourcepos)
+ tag = self.tag_class(
+ self, self.builder, name, namespace, nsprefix, attrs,
+ self.currentTag, self._most_recent_element,
+ sourceline=sourceline, sourcepos=sourcepos
+ )
if tag is None:
return tag
if self._most_recent_element is not None: