merged unescapeHTML branch; removed lxml dependency

2012-04-11 00:22:51 +02:00 · 2012-04-11 00:22:51 +02:00 · 9e6dd23876
commit 9e6dd23876
parent d11d05d07a 7a8501e307
5 changed files with 92 additions and 57 deletions
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -74,9 +74,86 @@ def htmlentity_transform(matchobj):
 	return (u'&%s;' % entity)


+class IDParser(HTMLParser.HTMLParser):
+	"""Modified HTMLParser that isolates a tag with the specified id.
+
+	Usage: create the parser with the wanted id, pass the document to
+	loads() and read the inner markup of the matching element from
+	get_result().
+
+	Inside a handler getpos() reports where the construct being handled
+	*starts*.  The content of the matching element therefore begins at the
+	position of whatever parser event follows its opening tag, and ends at
+	the position of its closing tag.
+	"""
+	def __init__(self, id):
+		# Value of the id attribute to look for
+		self.id = id
+		# None until a tag with the id is seen; afterwards a list that grows to
+		# [tag name, content start position, content end position]
+		self.result = None
+		# True while the parser is inside the matching element
+		self.started = False
+		# Open-tag counter per tag name, counted from the matching tag onwards
+		self.depth = {}
+		# Document handed to loads(); get_result() slices it
+		self.html = None
+		# True from the matching opening tag until the next parser event
+		self.watch_startpos = False
+		HTMLParser.HTMLParser.__init__(self)
+
+	def loads(self, html):
+		"""Parse html completely.
+
+		Parse errors raised by the base parser propagate to the caller."""
+		self.html = html
+		self.feed(html)
+		self.close()
+
+	def handle_starttag(self, tag, attrs):
+		"""Start capturing at the tag with the wanted id and count open tags."""
+		attrs = dict(attrs)
+		if self.started:
+			# A child tag may be the first event after the matching opening tag
+			self.find_startpos(None)
+		if 'id' in attrs and attrs['id'] == self.id:
+			self.result = [tag]
+			self.started = True
+			self.watch_startpos = True
+		if self.started:
+			self.depth[tag] = self.depth.get(tag, 0) + 1
+
+	def handle_endtag(self, tag):
+		"""Count closing tags and stop once the matching element is closed."""
+		if self.started:
+			# An end tag may directly follow the matching opening tag (empty
+			# element); record the start position here as well so that
+			# get_result() yields '' instead of None in that case
+			self.find_startpos(None)
+			if tag in self.depth: self.depth[tag] -= 1
+			if self.depth[self.result[0]] == 0:
+				self.started = False
+				self.result.append(self.getpos())
+
+	def find_startpos(self, x):
+		"""Needed to put the start position of the result (self.result[1])
+		after the opening tag with the requested id"""
+		if self.watch_startpos:
+			self.watch_startpos = False
+			self.result.append(self.getpos())
+	# Every other kind of parser event can also be the first one after the
+	# matching opening tag, so all of them record the start position
+	handle_entityref = handle_charref = handle_data = handle_comment = \
+	handle_decl = handle_pi = unknown_decl = find_startpos
+
+	def get_result(self):
+		"""Return the stripped markup between the matching opening tag and its
+		closing tag, or None if no complete element was captured."""
+		if self.result is None: return None
+		if len(self.result) != 3: return None
+		# Positions are (line, column) pairs; lines are counted from 1
+		(start_line, start_col), (end_line, end_col) = self.result[1:]
+		lines = self.html.split('\n')
+		lines = lines[start_line-1:end_line]
+		lines[0] = lines[0][start_col:]
+		if len(lines) == 1:
+			# Start and end share a line that has already lost its first
+			# start_col characters, so the end column shifts by that amount
+			lines[-1] = lines[-1][:end_col-start_col]
+		else:
+			lines[-1] = lines[-1][:end_col]
+		return '\n'.join(lines).strip()
+
+def get_element_by_id(id, html):
+	"""Look up the tag carrying the given id in an HTML document and return
+	what IDParser.get_result() reports for it."""
+	finder = IDParser(id)
+	try:
+		finder.loads(html)
+	except HTMLParser.HTMLParseError:
+		# Best effort: a parse error is ignored and whatever the parser
+		# captured before it is still returned below
+		pass
+	return finder.get_result()
+
+
+def clean_html(html):
+	"""Clean an HTML snippet into a readable string.
+
+	Newlines of the markup are turned into spaces, <br> tags (with their
+	surrounding whitespace) into newlines, every remaining tag is removed
+	and the text is finally passed through unescapeHTML()."""
+	# Newline vs <br />: only <br /> produces a line break in the output
+	html = html.replace('\n', ' ')
+	# Raw strings keep the regex backslashes out of string-escape processing
+	html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
+	# Strip html tags
+	html = re.sub(r'<.*?>', '', html)
+	# Replace html entities
+	html = unescapeHTML(html)
+	return html
+
+
 def sanitize_title(utitle):
 	"""Sanitizes a video title so it could be used as part of a filename."""
-	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
+	# The &...; substitution is now done by the shared unescapeHTML() helper
+	utitle = unescapeHTML(utitle)
+	# Every occurrence of the platform path separator is replaced by '%'
 	return utitle.replace(unicode(os.sep), u'%')


@ -133,8 +210,8 @@ def unescapeHTML(s):
 	"""
 	assert type(s) == type(u'')

-	htmlParser = HTMLParser.HTMLParser()
-	return htmlParser.unescape(s)
+	result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+	return result

 def encodeFilename(s):
 	"""