diff options
-rw-r--r-- | bs4/builder/_lxml.py | 2 | ||||
-rw-r--r-- | bs4/diagnose.py | 38 |
2 files changed, 39 insertions, 1 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index f718ed1..8638c59 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -65,7 +65,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         :return: A 3-tuple (markup, original encoding, encoding
         declared within markup).
         """
-        if isinstance(markup, unicode):
+        if isinstance(markup, unicode) or True:
             return markup, None, None, False
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index e336633..580bcff 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -4,6 +4,7 @@ from HTMLParser import HTMLParser
 from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry
 import os
+import random
 import traceback
 import sys
 
@@ -101,5 +102,42 @@ def htmlparser_trace(data):
     parser = AnnouncingParser()
     parser.feed(data)
 
+_vowels = "aeiou"
+_consonants = "bcdfghjklmnpqrstvwxyz"
+
+def rword(length=5):
+    "Generate a random word-like string."
+    s = ''
+    for i in range(length):
+        if i % 2 == 0:
+            t = _consonants
+        else:
+            t = _vowels
+        s += random.choice(t)
+    return s
+
+def rsentence(length=4):
+    "Generate a random sentence-like string."
+    return " ".join(rword(random.randint(4,9)) for i in range(length))
+
+def rdoc(num_elements=1000):
+    """Randomly generate an invalid HTML document."""
+    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
+    elements = []
+    for i in range(num_elements):
+        choice = random.randint(0,3)
+        if choice == 0:
+            # New tag.
+            tag_name = random.choice(tag_names)
+            elements.append("<%s>" % tag_name)
+        elif choice == 1:
+            elements.append(rsentence(random.randint(1,4)))
+        elif choice == 2:
+            # Close a tag.
+            tag_name = random.choice(tag_names)
+            elements.append("</%s>" % tag_name)
+    return "\n".join(elements)
+
+
 if __name__ == '__main__':
     diagnose(sys.stdin.read())