diff options
-rw-r--r-- | CHANGELOG | 7 | ||||
-rw-r--r-- | bs4/element.py | 26 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 25 |
3 files changed, 48 insertions, 10 deletions
@@ -6,8 +6,11 @@ and position within a line (Tag.sourcepos). Based on code by Chris Mayo. [bug=1742921] -* Fixed a bug that made it impossible to pretty-print tags that were not - created during initial parsing. [bug=1838903] +* Fixed a crash when pretty-printing tags that were not created + during initial parsing. [bug=1838903] + +* Copying a Tag preserves information that was originally obtained from + the TreeBuilder used to build the original Tag. [bug=1838903] = 4.8.0 (20190720, "One Small Soup") diff --git a/bs4/element.py b/bs4/element.py index 658115d..24d504b 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -724,7 +724,10 @@ class Tag(PageElement): def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None, - is_xml=None, sourceline=None, sourcepos=None): + is_xml=None, sourceline=None, sourcepos=None, + can_be_empty_element=None, cdata_list_attributes=None, + preserve_whitespace_tags=None + ): "Basic constructor." if parser is None: @@ -765,11 +768,12 @@ class Tag(PageElement): self.hidden = False if builder is None: - # In the absence of a TreeBuilder, assume this tag is nothing - # special. - self.can_be_empty_element = False - self.cdata_list_attributes = None - self.preserve_whitespace_tags = None + # In the absence of a TreeBuilder, use whatever values were + # passed in here. They're probably None, unless this is a copy of some + # other tag. + self.can_be_empty_element = can_be_empty_element + self.cdata_list_attributes = cdata_list_attributes + self.preserve_whitespace_tags = preserve_whitespace_tags else: # Set up any substitutions for this tag, such as the charset in a META tag. builder.set_up_substitutions(self) @@ -797,8 +801,14 @@ class Tag(PageElement): """A copy of a Tag is a new Tag, unconnected to the parse tree. Its contents are a copy of the old Tag's contents. 
""" - clone = type(self)(None, self.builder, self.name, self.namespace, - self.prefix, self.attrs, is_xml=self._is_xml) + clone = type(self)( + None, self.builder, self.name, self.namespace, + self.prefix, self.attrs, is_xml=self._is_xml, + sourceline=self.sourceline, sourcepos=self.sourcepos, + can_be_empty_element=self.can_be_empty_element, + cdata_list_attributes=self.cdata_list_attributes, + preserve_whitespace_tags=self.preserve_whitespace_tags + ) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) for child in self.contents: diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index c995c46..7d8da01 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1490,6 +1490,31 @@ class TestPersistence(SoupTest): self.assertEqual(u"<p> </p>", unicode(copy)) self.assertEqual(encoding, copy.original_encoding) + def test_copy_preserves_builder_information(self): + + tag = self.soup('<p></p>').p + + # Simulate a tag obtained from a source file. + tag.sourceline = 10 + tag.sourcepos = 33 + + copied = tag.__copy__() + + # The TreeBuilder object is no longer available, but information + # obtained from it gets copied over to the new Tag object. + self.assertEqual(tag.sourceline, copied.sourceline) + self.assertEqual(tag.sourcepos, copied.sourcepos) + self.assertEqual( + tag.can_be_empty_element, copied.can_be_empty_element + ) + self.assertEqual( + tag.cdata_list_attributes, copied.cdata_list_attributes + ) + self.assertEqual( + tag.preserve_whitespace_tags, copied.preserve_whitespace_tags + ) + + def test_unicode_pickle(self): # A tree containing Unicode characters can be pickled. html = u"<b>\N{SNOWMAN}</b>" |