diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-07-21 14:58:16 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-07-21 14:58:16 -0400 |
commit | b2294f4f05d9e8583613560986f8aa64b18866b9 (patch) | |
tree | 5af13a59eca15ea082cb46ea286bc9c5b91996da | |
parent | 819fa4255063d6b8d16f62469afa6c6e504f284a (diff) |
Adapt Chris Mayo's code to track line number and position when using html.parser.
-rw-r--r-- | CHANGELOG | 7 | ||||
-rw-r--r-- | bs4/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 24 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 9 | ||||
-rw-r--r-- | bs4/element.py | 6 | ||||
-rw-r--r-- | bs4/testing.py | 17 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 11 | ||||
-rw-r--r-- | doc/source/index.rst | 14 |
8 files changed, 87 insertions, 7 deletions
@@ -1,3 +1,10 @@ += Unreleased + +* When the html.parser is in use, Beautiful Soup will, by default, + record the position in the original document where each tag was + encountered. This includes line number (Tag.lineno) and position + within a line (Tag.offset). Based on code by Chris Mayo. + = 4.8.0 (20190720, "One Small Soup") This release focuses on making it easier to customize Beautiful Soup's diff --git a/bs4/__init__.py b/bs4/__init__.py index 9cd01c8..753aa73 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -531,7 +531,8 @@ class BeautifulSoup(Tag): return most_recently_popped - def handle_starttag(self, name, namespace, nsprefix, attrs): + def handle_starttag(self, name, namespace, nsprefix, attrs, lineno=None, + offset=None): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the @@ -549,7 +550,8 @@ class BeautifulSoup(Tag): return None tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, - self.currentTag, self._most_recent_element) + self.currentTag, self._most_recent_element, lineno=lineno, + offset=offset) if tag is None: return tag if self._most_recent_element is not None: diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index e087f07..e28242b 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -99,8 +99,13 @@ class TreeBuilder(object): DEFAULT_PRESERVE_WHITESPACE_TAGS = set() USE_DEFAULT = object() + + # Most parsers don't keep track of line numbers. + TRACKS_LINE_NUMBERS = False - def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): + def __init__(self, multi_valued_attributes=USE_DEFAULT, + preserve_whitespace_tags=USE_DEFAULT, + store_line_numbers=USE_DEFAULT): """Constructor. :param multi_valued_attributes: If this is set to None, the @@ -113,7 +118,17 @@ class TreeBuilder(object): probably doesn't make sense to an end-user, so the argument name is `multi_valued_attributes`. 
- :param preserve_whitespace_tags: + :param preserve_whitespace_tags: A list of tags to treat + the way <pre> tags are treated in HTML. Tags in this list + will have their whitespace preserved when the document is output. + + :param store_line_numbers: If the parser keeps track of the + line numbers and positions of the original markup, that + information will, by default, be stored in each corresponding + `Tag` object. You can turn this off by passing + store_line_numbers=False. If the parser you're using doesn't + keep track of this information, then setting store_line_numbers=True + will do nothing. """ self.soup = None if multi_valued_attributes is self.USE_DEFAULT: @@ -122,7 +137,10 @@ class TreeBuilder(object): if preserve_whitespace_tags is self.USE_DEFAULT: preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS self.preserve_whitespace_tags = preserve_whitespace_tags - + if store_line_numbers == self.USE_DEFAULT: + store_line_numbers = self.TRACKS_LINE_NUMBERS + self.store_line_numbers = store_line_numbers + def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now being associated with the TreeBuilder. diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 56b8b91..e2c87c1 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -99,7 +99,10 @@ class BeautifulSoupHTMLParser(HTMLParser): attr_dict[key] = value attrvalue = '""' #print "START", name - tag = self.soup.handle_starttag(name, None, None, attr_dict) + lineno, offset = self.getpos() + tag = self.soup.handle_starttag( + name, None, None, attr_dict, lineno=lineno, offset=offset + ) if tag and tag.is_empty_element and handle_empty_element: # Unlike other parsers, html.parser doesn't send separate end tag # events for empty-element tags. 
(It's handled in @@ -214,6 +217,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): NAME = HTMLPARSER features = [NAME, HTML, STRICT] + # The html.parser knows which line number and position in the + # original file is the source of a document. + TRACKS_LINE_NUMBERS = True + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): super(HTMLParserTreeBuilder, self).__init__(**kwargs) parser_args = parser_args or [] diff --git a/bs4/element.py b/bs4/element.py index 73e3867..41acf45 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -724,7 +724,7 @@ class Tag(PageElement): def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None, - is_xml=None): + is_xml=None, lineno=None, offset=None): "Basic constructor." if parser is None: @@ -738,6 +738,10 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix + if ((not builder or builder.store_line_numbers) + and (lineno is not None or offset is not None)): + self.lineno = lineno + self.offset = offset if attrs is None: attrs = {} elif attrs: diff --git a/bs4/testing.py b/bs4/testing.py index 9f12e8d..3e8d15b 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -790,6 +790,23 @@ Hello, world! soup = self.soup(BAD_DOCUMENT) self.linkage_validator(soup) + def test_tracking_line_numbers(self): + # In general, TreeBuilders do not keep track of + # line numbers from the original markup. Even if you + # ask for line numbers, we don't have 'em. + # + # This means that if you have a tag like <lineno> or <offset>, + # tag.lineno will find it rather than giving you a numeric + # answer. + # + # See HTMLParserTreeBuilderSmokeTest for a situation + # where the parser _does_ keep track of the line numbers. 
+ soup = self.soup( + "\n <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>", + store_line_numbers=True + ) + self.assertEqual("lineno", soup.p.lineno.name) + self.assertEqual("offset", soup.p.offset.name) class XMLTreeBuilderSmokeTest(object): diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index 790489a..c6a6691 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -37,6 +37,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): # finishes working is handled. self.assertSoupEquals("foo &# bar", "foo &# bar") + def test_tracking_line_numbers(self): + # Unlike other TreeBuilders, the html.parser TreeBuilder + # keeps track of line number and position of each element. + soup = self.soup( + "\n <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>", + store_line_numbers=True + ) + self.assertEqual(2, soup.p.lineno) + self.assertEqual(3, soup.p.offset) + self.assertEqual("lineno", soup.p.find('lineno').name) + class TestHTMLParserSubclass(SoupTest): def test_error(self): diff --git a/doc/source/index.rst b/doc/source/index.rst index 0c94d6a..69976fe 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2495,6 +2495,20 @@ machines, you should specify a parser in the ``BeautifulSoup`` constructor. That will reduce the chances that your users parse a document differently from the way you parse it. +Line numbers +------------ + +The html.parser parser will keep track of where in the original +document it found each Tag. You can access this information as +``Tag.lineno`` (line number) and ``Tag.offset`` (position of the start +tag within a line):: + + soup = BeautifulSoup("<p>Paragraph 1</p>\n <p>Paragraph 2</p>", 'html.parser') + for tag in soup.find_all('p'): + print(tag.lineno, tag.offset, tag.string) + # (1, 0, u'Paragraph 1') + # (2, 3, u'Paragraph 2') + Encodings ========= |