Merge remote-tracking branch 'upstream/master' into bliptv

2015-12-03 20:33:22 +01:00 · 2015-12-03 20:33:22 +01:00 · 78653a33aa
commit 78653a33aa
parent 77302fe5c9 24dc1ed715
188 changed files with 4047 additions and 1760 deletions
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -9,8 +9,8 @@ import sys
 from .common import InfoExtractor
 from .youtube import YoutubeIE
 from ..compat import (
+    compat_etree_fromstring,
    compat_urllib_parse_unquote,
-    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
 )
@@ -21,7 +21,7 @@ from ..utils import (
    HEADRequest,
    is_html,
    orderedSet,
-    parse_xml,
+    sanitized_Request,
    smuggle_url,
    unescapeHTML,
    unified_strdate,
@@ -30,7 +30,10 @@ from ..utils import (
    url_basename,
    xpath_text,
 )
-from .brightcove import BrightcoveIE
+from .brightcove import (
+    BrightcoveLegacyIE,
+    BrightcoveNewIE,
+)
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
@@ -140,6 +143,7 @@ class GenericIE(InfoExtractor):
                'ext': 'mp4',
                'title': 'Automatics, robotics and biocybernetics',
                'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+                'upload_date': '20130627',
                'formats': 'mincount:16',
                'subtitles': 'mincount:1',
            },
@@ -273,7 +277,7 @@ class GenericIE(InfoExtractor):
        # it also tests brightcove videos that need to set the 'Referer' in the
        # http requests
        {
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
            'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
            'info_dict': {
                'id': '2765128793001',
@@ -297,7 +301,7 @@ class GenericIE(InfoExtractor):
                'uploader': 'thestar.com',
                'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
            },
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
        },
        {
            'url': 'http://www.championat.com/video/football/v/87/87499.html',
@@ -312,7 +316,7 @@ class GenericIE(InfoExtractor):
        },
        {
            # https://github.com/rg3/youtube-dl/issues/3541
-            'add_ie': ['Brightcove'],
+            'add_ie': ['BrightcoveLegacy'],
            'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
            'info_dict': {
                'id': '3866516442001',
@@ -818,6 +822,19 @@ class GenericIE(InfoExtractor):
                'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
            },
        },
+        # Kaltura embed protected with referrer
+        {
+            'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero',
+            'info_dict': {
+                'id': '1_g4fbemnq',
+                'ext': 'mp4',
+                'title': 'Violetta - Achter De Schermen - Ruggero',
+                'description': 'Achter de schermen met Ruggero',
+                'timestamp': 1435133761,
+                'upload_date': '20150624',
+                'uploader_id': 'echojecka',
+            },
+        },
        # Eagle.Platform embed (generic URL)
        {
            'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -1029,6 +1046,31 @@ class GenericIE(InfoExtractor):
                'ext': 'mp4',
                'title': 'cinemasnob',
            },
+        },
+        # BrightcoveInPageEmbed embed
+        {
+            'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
+            'info_dict': {
+                'id': '4238694884001',
+                'ext': 'flv',
+                'title': 'Tabletop: Dread, Last Thoughts',
+                'description': 'Tabletop: Dread, Last Thoughts',
+                'duration': 51690,
+            },
+        },
+        # JWPlayer with M3U8
+        {
+            'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video',
+            'info_dict': {
+                'id': 'playlist',
+                'ext': 'mp4',
+                'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ',
+                'uploader': 'ren.tv',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            }
        }
    ]

@@ -1172,7 +1214,7 @@ class GenericIE(InfoExtractor):

        full_response = None
        if head_response is False:
-            request = compat_urllib_request.Request(url)
+            request = sanitized_Request(url)
            request.add_header('Accept-Encoding', '*')
            full_response = self._request_webpage(request, video_id)
            head_response = full_response
@@ -1201,7 +1243,7 @@ class GenericIE(InfoExtractor):
                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))

        if not full_response:
-            request = compat_urllib_request.Request(url)
+            request = sanitized_Request(url)
            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
            # making it impossible to download only chunk of the file (yet we need only 512kB to
            # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
@@ -1236,7 +1278,7 @@ class GenericIE(InfoExtractor):

        # Is it an RSS feed, a SMIL file or a XSPF playlist?
        try:
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
            if doc.tag == 'rss':
                return self._extract_rss(url, video_id, doc)
            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
@@ -1288,14 +1330,14 @@ class GenericIE(InfoExtractor):
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)

-        # Look for BrightCove:
-        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
+        # Look for Brightcove Legacy Studio embeds
+        bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
        if bc_urls:
            self.to_screen('Brightcove video detected.')
            entries = [{
                '_type': 'url',
                'url': smuggle_url(bc_url, {'Referer': url}),
-                'ie_key': 'Brightcove'
+                'ie_key': 'BrightcoveLegacy'
            } for bc_url in bc_urls]

            return {
@@ -1305,6 +1347,11 @@ class GenericIE(InfoExtractor):
                'entries': entries,
            }

+        # Look for Brightcove New Studio embeds
+        bc_urls = BrightcoveNewIE._extract_urls(webpage)
+        if bc_urls:
+            return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+
        # Look for embedded rtl.nl player
        matches = re.findall(
            r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
@@ -1665,10 +1712,12 @@ class GenericIE(InfoExtractor):
            return self.url_result(mobj.group('url'), 'Zapiks')

        # Look for Kaltura embeds
-        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
-                re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
+        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+                re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
        if mobj is not None:
-            return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
+            return self.url_result(smuggle_url(
+                'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(),
+                {'source_url': url}), 'Kaltura')

        # Look for Eagle.Platform embeds
        mobj = re.search(
@@ -1713,7 +1762,7 @@ class GenericIE(InfoExtractor):

        # Look for UDN embeds
        mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
+            r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
        if mobj is not None:
            return self.url_result(
                compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
@@ -1833,6 +1882,7 @@ class GenericIE(InfoExtractor):

        entries = []
        for video_url in found:
+            video_url = video_url.replace('\\/', '/')
            video_url = compat_urlparse.urljoin(url, video_url)
            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))

@@ -1844,25 +1894,24 @@ class GenericIE(InfoExtractor):
            # here's a fun little line of code for you:
            video_id = os.path.splitext(video_id)[0]

+            entry_info_dict = {
+                'id': video_id,
+                'uploader': video_uploader,
+                'title': video_title,
+                'age_limit': age_limit,
+            }
+
            ext = determine_ext(video_url)
            if ext == 'smil':
-                entries.append({
-                    'id': video_id,
-                    'formats': self._extract_smil_formats(video_url, video_id),
-                    'uploader': video_uploader,
-                    'title': video_title,
-                    'age_limit': age_limit,
-                })
+                entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)
            elif ext == 'xspf':
                return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+            elif ext == 'm3u8':
+                entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
            else:
-                entries.append({
-                    'id': video_id,
-                    'url': video_url,
-                    'uploader': video_uploader,
-                    'title': video_title,
-                    'age_limit': age_limit,
-                })
+                entry_info_dict['url'] = video_url
+
+            entries.append(entry_info_dict)

        if len(entries) == 1:
            return entries[0]