diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 14 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 45 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 7 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 6 | ||||
-rw-r--r-- | bs4/element.py | 8 | ||||
-rw-r--r-- | bs4/testing.py | 17 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 14 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 25 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 15 |
9 files changed, 102 insertions, 49 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 753aa73..e6efb38 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -377,10 +377,12 @@ class BeautifulSoup(Tag): self.preserve_whitespace_tag_stack = [] self.pushTag(self) - def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): + def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, + sourceline=None, sourcepos=None, **kwattrs): """Create a new tag associated with this soup.""" kwattrs.update(attrs) - return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) + return Tag(None, self.builder, name, namespace, nsprefix, kwattrs, + sourceline=sourceline, sourcepos=sourcepos) def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" @@ -531,8 +533,8 @@ class BeautifulSoup(Tag): return most_recently_popped - def handle_starttag(self, name, namespace, nsprefix, attrs, lineno=None, - offset=None): + def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, + sourcepos=None): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the @@ -550,8 +552,8 @@ class BeautifulSoup(Tag): return None tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, - self.currentTag, self._most_recent_element, lineno=lineno, - offset=offset) + self.currentTag, self._most_recent_element, + sourceline=sourceline, sourcepos=sourcepos) if tag is None: return tag if self._most_recent_element is not None: diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 6892a93..13f697c 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -45,6 +45,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder): features = [NAME, PERMISSIVE, HTML_5, HTML] + # html5lib can tell us which line number and position in the + # original file is the source of an element. 
+ TRACKS_LINE_NUMBERS = True + def prepare_markup(self, markup, user_specified_encoding, document_declared_encoding=None, exclude_encodings=None): # Store the user-specified encoding for use later on. @@ -62,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): if self.soup.parse_only is not None: warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") parser = html5lib.HTMLParser(tree=self.create_treebuilder) - + self.underlying_builder.parser = parser extra_kwargs = dict() if not isinstance(markup, unicode): if new_html5lib: @@ -70,7 +74,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): else: extra_kwargs['encoding'] = self.user_specified_encoding doc = parser.parse(markup, **extra_kwargs) - + # Set the character encoding detected by the tokenizer. if isinstance(markup, unicode): # We need to special-case this because html5lib sets @@ -84,10 +88,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # with other tree builders. original_encoding = original_encoding.name doc.original_encoding = original_encoding - + self.underlying_builder.parser = None + def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( - namespaceHTMLElements, self.soup) + namespaceHTMLElements, self.soup, + store_line_numbers=self.store_line_numbers + ) return self.underlying_builder def test_fragment_to_document(self, fragment): @@ -96,15 +103,26 @@ class HTML5TreeBuilder(HTMLTreeBuilder): class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): - - def __init__(self, namespaceHTMLElements, soup=None): + + def __init__(self, namespaceHTMLElements, soup=None, + store_line_numbers=True, **kwargs): if soup: self.soup = soup else: from bs4 import BeautifulSoup - self.soup = BeautifulSoup("", "html.parser") + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? 
+ self.soup = BeautifulSoup( + "", "html.parser", store_line_numbers=store_line_numbers, + **kwargs + ) super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + # This will be set later to an html5lib.html5parser.HTMLParser + # object, which we can use to track the current line number. + self.parser = None + self.store_line_numbers = store_line_numbers + def documentClass(self): self.soup.reset() return Element(self.soup, self.soup, None) @@ -118,7 +136,16 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): self.soup.object_was_parsed(doctype) def elementClass(self, name, namespace): - tag = self.soup.new_tag(name, namespace) + kwargs = {} + if self.parser and self.store_line_numbers: + # This represents the point immediately after the end of the + # tag. We don't know when the tag started, but we do know + # where it ended -- the character just before this one. + sourceline, sourcepos = self.parser.tokenizer.stream.position() + kwargs['sourceline'] = sourceline + kwargs['sourcepos'] = sourcepos-1 + tag = self.soup.new_tag(name, namespace, **kwargs) + return Element(tag, self.soup, namespace) def commentClass(self, data): @@ -126,6 +153,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): def fragmentClass(self): from bs4 import BeautifulSoup + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? 
self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index e2c87c1..cd50eb0 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -99,9 +99,10 @@ class BeautifulSoupHTMLParser(HTMLParser): attr_dict[key] = value attrvalue = '""' #print "START", name - lineno, offset = self.getpos() + sourceline, sourcepos = self.getpos() tag = self.soup.handle_starttag( - name, None, None, attr_dict, lineno=lineno, offset=offset + name, None, None, attr_dict, sourceline=sourceline, + sourcepos=sourcepos ) if tag and tag.is_empty_element and handle_empty_element: # Unlike other parsers, html.parser doesn't send separate end tag @@ -218,7 +219,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): features = [NAME, HTML, STRICT] # The html.parser knows which line number and position in the - # original file is the source of a document. + # original file is the source of an element. TRACKS_LINE_NUMBERS = True def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 27cadcb..85be1b5 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -57,6 +57,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + # NOTE: If we parsed Element objects and looked at .sourceline, + # we'd be able to see the line numbers from the original document. + # But instead we build an XMLParser or HTMLParser object to serve + # as the target of parse messages, and those messages don't include + # line numbers. + def initialize_soup(self, soup): """Let the BeautifulSoup object know about the standard namespace mapping. 
diff --git a/bs4/element.py b/bs4/element.py index 41acf45..a610008 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -724,7 +724,7 @@ class Tag(PageElement): def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None, - is_xml=None, lineno=None, offset=None): + is_xml=None, sourceline=None, sourcepos=None): "Basic constructor." if parser is None: @@ -739,9 +739,9 @@ class Tag(PageElement): self.namespace = namespace self.prefix = prefix if ((not builder or builder.store_line_numbers) - and (lineno is not None or offset is not None)): - self.lineno = lineno - self.offset = offset + and (sourceline is not None or sourcepos is not None)): + self.sourceline = sourceline + self.sourcepos = sourcepos if attrs is None: attrs = {} elif attrs: diff --git a/bs4/testing.py b/bs4/testing.py index 3e8d15b..9f12e8d 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -790,23 +790,6 @@ Hello, world! soup = self.soup(BAD_DOCUMENT) self.linkage_validator(soup) - def test_tracking_line_numbers(self): - # In general, TreeBuilders do not keep track of - # line numbers from the original markup. Even if you - # ask for line numbers, we don't have 'em. - # - # This means that if you have a tag like <lineno> or <offset>, - # tag.lineno will find it rather than giving you a numeric - # answer. - # - # See HTMLParserTreeBuilderSmokeTest for a situation - # where the parser _does_ keep track of the line numbers. 
- soup = self.soup( - "\n <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>", - store_line_numbers=True - ) - self.assertEqual("lineno", soup.p.lineno.name) - self.assertEqual("offset", soup.p.offset.name) class XMLTreeBuilderSmokeTest(object): diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 371463a..6446f84 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -168,3 +168,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): for form in soup.find_all('form'): inputs.extend(form.find_all('input')) self.assertEqual(len(inputs), 1) + + def test_tracking_line_numbers(self): + # The html5lib TreeBuilder keeps track of line number and + # position of each element. + markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>" + soup = self.soup(markup) + self.assertEqual(2, soup.p.sourceline) + self.assertEqual(5, soup.p.sourcepos) + self.assertEqual("sourceline", soup.p.find('sourceline').name) + + # You can deactivate this behavior. + soup = self.soup(markup, store_line_numbers=False) + self.assertEqual("sourceline", soup.p.sourceline.name) + self.assertEqual("sourcepos", soup.p.sourcepos.name) diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index c6a6691..7be6493 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -38,17 +38,20 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals("foo &# bar", "foo &# bar") def test_tracking_line_numbers(self): - # Unlike other TreeBuilders, the html.parser TreeBuilder - # keeps track of line number and position of each element. 
- soup = self.soup( - "\n <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>", - store_line_numbers=True - ) - self.assertEqual(2, soup.p.lineno) - self.assertEqual(3, soup.p.offset) - self.assertEqual("lineno", soup.p.find('lineno').name) - - + # The html.parser TreeBuilder keeps track of line number and + # position of each element. + markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>" + soup = self.soup(markup) + self.assertEqual(2, soup.p.sourceline) + self.assertEqual(3, soup.p.sourcepos) + self.assertEqual("sourceline", soup.p.find('sourceline').name) + + # You can deactivate this behavior. + soup = self.soup(markup, store_line_numbers=False) + self.assertEqual("sourceline", soup.p.sourceline.name) + self.assertEqual("sourcepos", soup.p.sourcepos.name) + + class TestHTMLParserSubclass(SoupTest): def test_error(self): """Verify that our HTMLParser subclass implements error() in a way diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 3b7858f..f96e4ae 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -71,6 +71,21 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertEqual(u"<b/>", unicode(soup.b)) self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) + def test_tracking_line_numbers(self): + # The lxml TreeBuilder cannot keep track of line numbers from + # the original markup. Even if you ask for line numbers, we + # don't have 'em. + # + # This means that if you have a tag like <sourceline> or + # <sourcepos>, attribute access will find it rather than + # giving you a numeric answer. + soup = self.soup( + "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>", + store_line_numbers=True + ) + self.assertEqual("sourceline", soup.p.sourceline.name) + self.assertEqual("sourcepos", soup.p.sourcepos.name) + @skipIf( not LXML_PRESENT, "lxml seems not to be present, not testing its XML tree builder.") |