[youtube] Extract chapters
This commit is contained in:
parent
329e3dd5ad
commit
9cafc3fd8b
2 changed files with 301 additions and 2 deletions
|
@ -1257,6 +1257,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
|
||||
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
|
||||
|
||||
@staticmethod
|
||||
def _extract_chapters(description, duration):
|
||||
if not description:
|
||||
return None
|
||||
chapter_lines = re.findall(
|
||||
r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
|
||||
description)
|
||||
if not chapter_lines:
|
||||
return None
|
||||
chapters = []
|
||||
for next_num, (chapter_line, time_point) in enumerate(
|
||||
chapter_lines, start=1):
|
||||
start_time = parse_duration(time_point)
|
||||
if start_time is None:
|
||||
continue
|
||||
end_time = (duration if next_num == len(chapter_lines)
|
||||
else parse_duration(chapter_lines[next_num][1]))
|
||||
if end_time is None:
|
||||
continue
|
||||
chapter_title = re.sub(
|
||||
r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
|
||||
chapter_title = re.sub(r'\s+', ' ', chapter_title)
|
||||
chapters.append({
|
||||
'start_time': start_time,
|
||||
'end_time': end_time,
|
||||
'title': chapter_title,
|
||||
})
|
||||
return chapters
|
||||
|
||||
def _real_extract(self, url):
|
||||
url, smuggled_data = unsmuggle_url(url, {})
|
||||
|
||||
|
@ -1399,9 +1428,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
video_title = '_'
|
||||
|
||||
# description
|
||||
video_description = get_element_by_id("eow-description", video_webpage)
|
||||
description_original = video_description = get_element_by_id("eow-description", video_webpage)
|
||||
if video_description:
|
||||
video_description = re.sub(r'''(?x)
|
||||
description_original = video_description = re.sub(r'''(?x)
|
||||
<a\s+
|
||||
(?:[a-zA-Z-]+="[^"]*"\s+)*?
|
||||
(?:title|href)="([^"]+)"\s+
|
||||
|
@ -1558,6 +1587,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
if self._downloader.params.get('writeannotations', False):
|
||||
video_annotations = self._extract_annotations(video_id)
|
||||
|
||||
chapters = self._extract_chapters(description_original, video_duration)
|
||||
|
||||
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
|
||||
self.report_rtmp_download()
|
||||
formats = [{
|
||||
|
@ -1790,6 +1821,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
|||
'duration': video_duration,
|
||||
'age_limit': 18 if age_gate else 0,
|
||||
'annotations': video_annotations,
|
||||
'chapters': chapters,
|
||||
'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
|
||||
'view_count': view_count,
|
||||
'like_count': like_count,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue