[utils] Add extract_attributes for extracting html tag attributes

This is much more robust than just using regexps, and handles all the common scenarios, such as empty/no values, repeated attributes, entity decoding, mixed case names, and the different possible value quoting schemes.
2016-01-02 19:49:59 +00:00 · 2016-01-02 19:49:59 +00:00 · 8bb56eeeea
commit 8bb56eeeea
parent 03879ff054
3 changed files with 76 additions and 0 deletions
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -35,6 +35,7 @@ import xml.etree.ElementTree
 import zlib

 from .compat import (
+    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):

    return unescapeHTML(res)

+class HTMLAttributeParser(compat_HTMLParser):
+    """Trivial HTML parser that gathers the attributes of a single element (the last start tag handled wins)"""
+    def __init__(self):
+        self.attrs = { }  # stays empty if no start tag is ever handled
+        compat_HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        self.attrs = dict(attrs)  # replaces earlier results; dict() keeps the last value of a repeated name
+
+def extract_attributes(html_element):
+    """Given a string for an HTML element such as
+    <el
+         a="foo" B="bar" c="&#98;az" d=boz
+         empty= noval entity="&amp;"
+         sq='"' dq="'"
+    >
+    Decode and return a dictionary of attributes.
+    {
+        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
+        'empty': '', 'noval': None, 'entity': '&',
+        'sq': '"', 'dq': "'"
+    }.
+    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+    """
+    parser = HTMLAttributeParser()
+    parser.feed(html_element)
+    parser.close()
+    return parser.attrs

 def clean_html(html):
    """Clean an HTML snippet into a readable string"""