 67b7e712d8
			
		
	
	
		67b7e712d8
		
	
	
	
	
		
			
			We're now checking that the file is sufficiently large by default, and they seem to reencode the videos from time to time.
		
			
				
	
	
		
			130 lines
		
	
	
	
		
			4.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			130 lines
		
	
	
	
		
			4.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from __future__ import unicode_literals
 | |
| 
 | |
| import re
 | |
| 
 | |
| from .common import InfoExtractor
 | |
| from ..utils import (
 | |
|     ExtractorError,
 | |
|     unified_strdate,
 | |
|     str_to_int,
 | |
|     int_or_none,
 | |
|     parse_duration,
 | |
| )
 | |
| 
 | |
| 
 | |
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""

    # The scheme is captured (``proto``) so the watch page can be requested
    # over the same protocol the caller supplied; both http and https match.
    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
    _TESTS = [
        {
            'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
            'info_dict': {
                'id': '1509445',
                'ext': 'mp4',
                'title': 'FemaleAgent Shy beauty takes the bait',
                'upload_date': '20121014',
                'uploader_id': 'Ruseful2011',
                'duration': 893,
                'age_limit': 18,
            }
        },
        {
            'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
            'info_dict': {
                'id': '2221348',
                'ext': 'mp4',
                'title': 'Britney Spears  Sexy Booty',
                'upload_date': '20130914',
                'uploader_id': 'jojo747400',
                'duration': 200,
                'age_limit': 18,
            }
        }
    ]

    def _real_extract(self, url):
        """Scrape the watch page for ``url`` and build the info dict.

        The page is downloaded without any query string; if it is not
        flagged as HD, the canonical URL with ``?hd`` appended is fetched as
        well and, when that page is flagged as HD, its media URL is added as
        a second, higher-preference format.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``upload_date``, ``uploader_id``, ``thumbnail``, ``duration``,
        ``view_count``, ``like_count``, ``dislike_count``, ``comment_count``,
        ``age_limit`` and ``formats``.

        Raises ExtractorError when a page that should carry the media has no
        ``<video ... file="...">`` tag.
        """
        def extract_video_url(webpage):
            # The media location is the ``file`` attribute of the <video> tag.
            mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
            if mp4 is None:
                raise ExtractorError('Unable to extract media URL')
            return mp4.group(1)

        def is_hd(webpage):
            # A page is treated as HD when it contains the HD icon markup.
            return "<div class='icon iconHD'" in webpage

        url_match = re.match(self._VALID_URL, url)
        proto = url_match.group('proto')
        video_id = url_match.group('id')
        seo = url_match.group('seo')

        # Rebuild the URL without subdomain or query string, keeping the
        # scheme of the original request.
        mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
        webpage = self._download_webpage(mrss_url, video_id)

        title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')

        # Only a few videos have a description
        description_match = re.search(r'<span>Description: </span>([^<]+)', webpage)
        description = description_match.group(1) if description_match else None

        upload_date = self._html_search_regex(
            r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
            webpage, 'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        # Pages without a user link fall back to 'anonymous'.
        uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, 'uploader id', default='anonymous')

        thumbnail = self._html_search_regex(
            r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)

        duration = parse_duration(self._html_search_regex(
            r'<span>Runtime:</span> (\d+:\d+)</div>',
            webpage, 'duration', fatal=False))

        view_count = self._html_search_regex(
            r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
        if view_count:
            view_count = str_to_int(view_count)

        # Likes and dislikes share a single tooltip string.
        rating_match = re.search(
            r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
        if rating_match:
            like_count = rating_match.group('likecount')
            dislike_count = rating_match.group('dislikecount')
        else:
            like_count = dislike_count = None

        # Unlike the other counters, a missing comment header is reported as 0.
        comments_match = re.search(
            r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
        comment_count = comments_match.group('commentcount') if comments_match else 0

        age_limit = self._rta_search(webpage)

        hd = is_hd(webpage)

        formats = [{
            'url': extract_video_url(webpage),
            'format_id': 'hd' if hd else 'sd',
            'preference': 1,
        }]

        if not hd:
            # The default page served SD; ask for the HD variant of the
            # canonical URL and add it only if that page is flagged as HD.
            mrss_url = self._search_regex(
                r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
            webpage = self._download_webpage(
                mrss_url + '?hd', video_id, note='Downloading HD webpage')
            if is_hd(webpage):
                formats.append({
                    'url': extract_video_url(webpage),
                    'format_id': 'hd',
                    'preference': 2,
                })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': int_or_none(like_count),
            'dislike_count': int_or_none(dislike_count),
            'comment_count': int_or_none(comment_count),
            'age_limit': age_limit,
            'formats': formats,
        }
 |