diff options
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/__init__.py | 24 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 9 |
2 files changed, 29 insertions, 4 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index e087f07..e28242b 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -99,8 +99,13 @@ class TreeBuilder(object): DEFAULT_PRESERVE_WHITESPACE_TAGS = set() USE_DEFAULT = object() + + # Most parsers don't keep track of line numbers. + TRACKS_LINE_NUMBERS = False - def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): + def __init__(self, multi_valued_attributes=USE_DEFAULT, + preserve_whitespace_tags=USE_DEFAULT, + store_line_numbers=USE_DEFAULT): """Constructor. :param multi_valued_attributes: If this is set to None, the @@ -113,7 +118,17 @@ class TreeBuilder(object): probably doesn't make sense to an end-user, so the argument name is `multi_valued_attributes`. - :param preserve_whitespace_tags: + :param preserve_whitespace_tags: A list of tags to treat + the way <pre> tags are treated in HTML. Tags in this list + will have their whitespace preserved (their contents are output as-is). + + :param store_line_numbers: If the parser keeps track of the + line numbers and positions of the original markup, that + information will, by default, be stored in each corresponding + `Tag` object. You can turn this off by passing + store_line_numbers=False. If the parser you're using doesn't + keep track of this information, then setting store_line_numbers=True + will do nothing. """ self.soup = None if multi_valued_attributes is self.USE_DEFAULT: @@ -122,7 +137,10 @@ class TreeBuilder(object): if preserve_whitespace_tags is self.USE_DEFAULT: preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS self.preserve_whitespace_tags = preserve_whitespace_tags - + if store_line_numbers == self.USE_DEFAULT: + store_line_numbers = self.TRACKS_LINE_NUMBERS + self.store_line_numbers = store_line_numbers + def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now being associated with the TreeBuilder. 
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 56b8b91..e2c87c1 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -99,7 +99,10 @@ class BeautifulSoupHTMLParser(HTMLParser): attr_dict[key] = value attrvalue = '""' #print "START", name - tag = self.soup.handle_starttag(name, None, None, attr_dict) + lineno, offset = self.getpos() + tag = self.soup.handle_starttag( + name, None, None, attr_dict, lineno=lineno, offset=offset + ) if tag and tag.is_empty_element and handle_empty_element: # Unlike other parsers, html.parser doesn't send separate end tag # events for empty-element tags. (It's handled in @@ -214,6 +217,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): NAME = HTMLPARSER features = [NAME, HTML, STRICT] + # The html.parser knows which line number and position in the + # original file is the source of each tag. + TRACKS_LINE_NUMBERS = True + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): super(HTMLParserTreeBuilder, self).__init__(**kwargs) parser_args = parser_args or [] |