"""Tests to ensure that the lxml tree builder generates good trees.""" from beautifulsoup import BeautifulSoup from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder from beautifulsoup.element import Comment from beautifulsoup.testing import SoupTest class TestLXMLBuilder(SoupTest): """A smoke test for the LXML tree builders. Subclass this to test some other tree builder. Subclasses of this test ensure that all of Beautiful Soup's tree builders generate more or less the same trees. It's okay for trees to differ, especially when given invalid markup--just override the appropriate test method to demonstrate how one tree builder differs from the LXML builder. """ def test_bare_string(self): # A bare string is turned into some kind of HTML document or # fragment recognizable as the original string. # # In this case, lxml puts a

tag around the bare string. self.assertSoupEquals( "A bare string", "

A bare string

") def test_mixed_case_tags(self): # Mixed-case tags are folded to lowercase. self.assertSoupEquals( "", "") def test_self_closing(self): # HTML's self-closing tags are recognized as such. self.assertSoupEquals( "

A tag

", "

A tag

") self.assertSoupEquals( "

Foo
bar

", "

Foo
bar

") def test_comment(self): # Comments are represented as Comment objects. markup = "

foobaz

" self.assertSoupEquals(markup) soup = self.soup(markup) comment = soup.find(text="foobar") self.assertEquals(comment.__class__, Comment) def test_nested_inline_elements(self): # Inline tags can be nested indefinitely. b_tag = "Inside a B tag" self.assertSoupEquals(b_tag) nested_b_tag = "

A nested tag

" self.assertSoupEquals(nested_b_tag) double_nested_b_tag = "

A doubly nested tag

" self.assertSoupEquals(nested_b_tag) def test_nested_block_level_elements(self): soup = self.soup('

Foo

') blockquote = soup.blockquote self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') def test_collapsed_whitespace(self): """In most tags, whitespace is collapsed.""" self.assertSoupEquals("

", "

") def test_preserved_whitespace_in_pre_and_textarea(self): """In

 and  tags, whitespace is preserved."""
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<textarea> woo  ")

    def test_single_quote_attribute_values_become_double_quotes(self):
        # Input markup and the rendering expected back from the builder.
        # NOTE(review): both literals are empty in this copy; the markup
        # that exercised the quoting appears to be missing -- confirm.
        markup = ""
        expected = ''
        self.assertSoupEquals(markup, expected)

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        # The markup is handed over on its own, with no separate
        # expected value.
        markup = "a"
        self.assertSoupEquals(markup)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        # Give the tag an attribute value containing both kinds of quote,
        # then compare its rendering with the expected markup.
        soup = self.soup("a")
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        rendered = soup.foo.decode()
        self.assertSoupEquals(rendered, "a")

    def test_ampersand_in_attribute_value_gets_quoted(self):
        # Input markup and the rendering expected back from the builder.
        # NOTE(review): both literals are empty in this copy -- confirm
        # against the upstream test before relying on this check.
        markup = ''
        expected = ''
        self.assertSoupEquals(markup, expected)

    def test_literal_in_textarea(self):
        # Anything inside a <textarea> is supposed to be treated as
        # the literal value of the field, (XXX citation needed).
        #
        # But, both lxml and html5lib do their best to parse the
        # contents of a <textarea> as HTML.
        text = '<textarea>Junk like <b> tags and <&<&'
        soup = BeautifulSoup(text)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(soup.textarea.contents), 2)
        self.assertEqual(soup.textarea.contents[0], u"Junk like ")
        self.assertEqual(soup.textarea.contents[1].name, 'b')
        self.assertEqual(soup.textarea.b.string, u" tags and ")

    def test_literal_in_script(self):
        # NOTE(review): this method looks truncated in this copy. The
        # comment that began "The contents of a" breaks off, and nothing
        # here defines `soup` or `javascript` before they are used. The
        # leftover fragment "' % javascript)" suggests `soup` was built
        # from a %-formatted string -- restore the missing lines from
        # upstream before running this test.
        self.assertEquals(soup.script.string, javascript)

    def test_naked_ampersands(self):
        # Ampersands are left alone.
        text = "AT&T"
        soup = self.soup(text)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(soup.p.string, "AT&T")

        # Even if they're in attribute values.
        invalid_url = 'foo'
        soup = self.soup(invalid_url)
        self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3")

    def test_entities_in_strings_converted_during_parsing(self):
        # Entities, XML and HTML alike, are expected to come out of the
        # parser as Unicode characters.
        self.assertSoupEquals("<<sacré bleu!>>", u"<>")

    def test_entities_in_attribute_values_converted_during_parsing(self):
        # Each section parses a snippet and checks the decoded value of
        # an attribute. assertEqual replaces the deprecated assertEquals
        # alias throughout.
        text = ''
        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
        soup = self.soup(text)
        self.assertEqual(soup.x['t'], expected)

        text = ''
        soup = self.soup(text)
        self.assertEqual(soup.x['t'], expected)

        text = ''
        soup = self.soup(text)
        self.assertEqual(
            soup.x['t'],
            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")

        # This can cause valid HTML to become invalid.
        valid_url = 'foo'
        soup = self.soup(valid_url)
        self.assertEqual(soup.a['href'], "http://example.org?a=1&b=2;3")

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = "\x91Foo\x92"
        soup = self.soup(quote)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        # The parsed string is expected to be two NO-BREAK SPACE characters.
        soup = self.soup("  ")
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    # Tests below this line need work.

    def test_entities_converted_on_the_way_out(self):
        # Unfinished test: it parses the text and fetches the string, but
        # the check on the encoded output is still disabled.
        text = "<<sacré bleu!>>"
        expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8")
        soup = BeautifulSoup(text)
        # Named `string` rather than `str` so the builtin is not shadowed.
        string = soup.p.string
        # TODO: enable once this test has been finished:
        # self.assertEqual(string.encode("utf-8"), expected)

    def test_foo(self):
        # Unfinished: parses a sample containing a \xe9 byte and derives a
        # second string from it, but asserts nothing yet.
        isolatin = """Sacr\xe9 bleu!"""
        soup = self.soup(isolatin)

        old_label = "ISO-Latin-1".encode()
        new_label = "utf-8".encode()
        utf8 = isolatin.replace(old_label, new_label).replace("\xe9", "\xc3\xa9")
        # Debugging aid left disabled: print soup


class TestLXMLBuilderInvalidMarkup(SoupTest):
    """Tests of invalid markup for the LXML tree builder.

    Subclass this to test other builders.

    These are very likely to give different results for different tree
    builders. It's not required that a tree builder handle invalid
    markup at all.
    """

    # NOTE(review): the markup literals below look as if their tags were
    # stripped at some point. They are kept exactly as found, with the
    # line breaks that were embedded in the quoted strings written as
    # "\n" so the module parses -- verify them against upstream.

    def test_unclosed_block_level_elements(self):
        # Unclosed block-level elements should be closed.
        self.assertSoupEquals(
            'Foo\nBar',
            '\nFoo\nBar\n')

    def test_fake_self_closing_tag(self):
        # If a self-closing tag presents as a normal tag, the 'open'
        # tag is treated as an instance of the self-closing tag and
        # the 'close' tag is ignored.
        self.assertSoupEquals(
            "http://foo.com/",
            "http://foo.com/")

    def test_boolean_attribute_with_no_value_gets_empty_value(self):
        soup = self.soup("foo")
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(soup.table.td['nowrap'], '')

    def test_incorrectly_nested_tables(self):
        self.assertSoupEquals(
            '',
            '\n')