[utils] Add extract_attributes for extracting html tag attributes

This is much more robust than just using regular expressions, and handles all the common scenarios, such as empty/no values, repeated attributes, entity decoding, mixed-case names, and the different possible value quoting schemes.
2016-01-02 19:49:59 +00:00 · 2016-01-02 19:49:59 +00:00 · 8bb56eeeea
commit 8bb56eeeea
parent 03879ff054
3 changed files with 76 additions and 0 deletions
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@ -77,6 +77,11 @@ try:
 except ImportError:  # Python 2
    from urllib import urlretrieve as compat_urlretrieve

+try:
+    from html.parser import HTMLParser as compat_HTMLParser
+except ImportError:  # Python 2
+    from HTMLParser import HTMLParser as compat_HTMLParser
+

 try:
    from subprocess import DEVNULL
@ -540,6 +545,7 @@ else:
    from tokenize import generate_tokens as compat_tokenize_tokenize

 __all__ = [
+    'compat_HTMLParser',
    'compat_HTTPError',
    'compat_basestring',
    'compat_chr',