Created
September 28, 2017 01:44
-
-
Save simon-liu/3eb6d4f9b9668f8f57c027a93ff8d105 to your computer and use it in GitHub Desktop.
html parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from html.parser import HTMLParser as HTMLParserBase | |
class HTMLParser(HTMLParserBase):
    """HTML parser that records the document title and all text data.

    Attributes:
        title: First non-empty text chunk seen while inside a ``<title>``
            element; ``''`` if none was seen.
        catch_title: True while the most recently opened tag is ``<title>``
            and it has not been closed yet.
        content: Every text chunk passed to ``handle_data``, each one
            prefixed with a newline and concatenated in document order.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base parser and reset the state."""
        super().__init__(*args, **kwargs)
        self.title = ''
        self.catch_title = False
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """Start capturing the title on ``<title>``; stop on any other tag."""
        self.catch_title = tag == 'title'

    def handle_endtag(self, tag):
        """Stop capturing the title once ``</title>`` is reached.

        Without this, an empty ``<title></title>`` left the flag set, so the
        first text that followed the element was mistaken for the title.
        """
        if tag == 'title':
            self.catch_title = False

    def handle_data(self, data):
        """Record ``data`` as the title if applicable and append to content."""
        # Only the first non-empty chunk inside <title> is kept as the title.
        if self.catch_title and not self.title:
            self.title = data
        self.content += '\n' + data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment