[utils] Add extract_attributes for extracting html tag attributes

This is much more robust than just using regexps, and handles all
the common scenarios, such as empty/no values, repeated attributes,
entity decoding, mixed-case names, and the different possible
value-quoting schemes.
This commit is contained in:
Brian Foley 2016-01-02 19:49:59 +00:00
parent 03879ff054
commit 8bb56eeeea
3 changed files with 76 additions and 0 deletions

View file

@ -77,6 +77,11 @@ try:
except ImportError: # Python 2
from urllib import urlretrieve as compat_urlretrieve
# Expose HTMLParser under the single name compat_HTMLParser: try the
# html.parser module first, and fall back to the Python 2 HTMLParser
# module when that import fails.
try:
from html.parser import HTMLParser as compat_HTMLParser
except ImportError: # Python 2
from HTMLParser import HTMLParser as compat_HTMLParser
try:
from subprocess import DEVNULL
@ -540,6 +545,7 @@ else:
from tokenize import generate_tokens as compat_tokenize_tokenize
__all__ = [
'compat_HTMLParser',
'compat_HTTPError',
'compat_basestring',
'compat_chr',