Add infrastructure for paged lists

This commit allows to download pages in playlists as needed instead of all at once.
Before this commit,
    youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download
took quite some time - now it's almost instantaneous.
As an example, the youtube:user extractor has been converted.
Fixes #2175
This commit is contained in:
Philipp Hagemeister 2014-01-20 11:36:47 +01:00
parent c91778f8c0
commit b7ab059084
4 changed files with 92 additions and 25 deletions

View file

@ -27,6 +27,7 @@ from ..utils import (
get_element_by_id,
get_element_by_attribute,
ExtractorError,
PagedList,
RegexNotFoundError,
unescapeHTML,
unified_strdate,
@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor):
# page by page until there are no video ids - it means we got
# all of them.
url_results = []
for pagenum in itertools.count(0):
def download_page(pagenum):
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
page = self._download_webpage(gdata_url, username,
u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
page = self._download_webpage(
gdata_url, username,
u'Downloading video ids from %d to %d' % (
start_index, start_index + self._GDATA_PAGE_SIZE))
try:
response = json.loads(page)
except ValueError as err:
raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
if 'entry' not in response['feed']:
# Number of videos is a multiple of self._MAX_RESULTS
break
return
# Extract video identifiers
entries = response['feed']['entry']
for entry in entries:
title = entry['title']['$t']
video_id = entry['id']['$t'].split('/')[-1]
url_results.append({
yield {
'_type': 'url',
'url': video_id,
'ie_key': 'Youtube',
'id': 'video_id',
'title': title,
})
# A little optimization - if current page is not
# "full", ie. does not contain PAGE_SIZE video ids then
# we can assume that this page is the last one - there
# are no more ids on further pages - no need to query
# again.
if len(entries) < self._GDATA_PAGE_SIZE:
break
}
url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
return self.playlist_result(url_results, playlist_title=username)