summaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/vimeo.py
blob: 6af70565781e391915d807f49639a859c8f1b9ff (plain)
    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import json
    5 import re
    6 import itertools
    7 
    8 from .common import InfoExtractor
    9 from ..compat import (
   10     compat_HTTPError,
   11     compat_str,
   12     compat_urlparse,
   13 )
   14 from ..utils import (
   15     determine_ext,
   16     ExtractorError,
   17     InAdvancePagedList,
   18     int_or_none,
   19     NO_DEFAULT,
   20     RegexNotFoundError,
   21     sanitized_Request,
   22     smuggle_url,
   23     std_headers,
   24     try_get,
   25     unified_timestamp,
   26     unsmuggle_url,
   27     urlencode_postdata,
   28     unescapeHTML,
   29     parse_filesize,
   30 )
   31 
   32 
   33 class VimeoBaseInfoExtractor(InfoExtractor):
   34     _NETRC_MACHINE = 'vimeo'
   35     _LOGIN_REQUIRED = False
   36     _LOGIN_URL = 'https://vimeo.com/log_in'
   37 
   38     def _login(self):
   39         (username, password) = self._get_login_info()
   40         if username is None:
   41             if self._LOGIN_REQUIRED:
   42                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
   43             return
   44         self.report_login()
   45         webpage = self._download_webpage(self._LOGIN_URL, None, False)
   46         token, vuid = self._extract_xsrft_and_vuid(webpage)
   47         data = urlencode_postdata({
   48             'action': 'login',
   49             'email': username,
   50             'password': password,
   51             'service': 'vimeo',
   52             'token': token,
   53         })
   54         login_request = sanitized_Request(self._LOGIN_URL, data)
   55         login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
   56         login_request.add_header('Referer', self._LOGIN_URL)
   57         self._set_vimeo_cookie('vuid', vuid)
   58         self._download_webpage(login_request, None, False, 'Wrong login info')
   59 
   60     def _verify_video_password(self, url, video_id, webpage):
   61         password = self._downloader.params.get('videopassword')
   62         if password is None:
   63             raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
   64         token, vuid = self._extract_xsrft_and_vuid(webpage)
   65         data = urlencode_postdata({
   66             'password': password,
   67             'token': token,
   68         })
   69         if url.startswith('http://'):
   70             # vimeo only supports https now, but the user can give an http url
   71             url = url.replace('http://', 'https://')
   72         password_request = sanitized_Request(url + '/password', data)
   73         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
   74         password_request.add_header('Referer', url)
   75         self._set_vimeo_cookie('vuid', vuid)
   76         return self._download_webpage(
   77             password_request, video_id,
   78             'Verifying the password', 'Wrong password')
   79 
   80     def _extract_xsrft_and_vuid(self, webpage):
   81         xsrft = self._search_regex(
   82             r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
   83             webpage, 'login token', group='xsrft')
   84         vuid = self._search_regex(
   85             r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1',
   86             webpage, 'vuid', group='vuid')
   87         return xsrft, vuid
   88 
   89     def _set_vimeo_cookie(self, name, value):
   90         self._set_cookie('vimeo.com', name, value)
   91 
   92     def _vimeo_sort_formats(self, formats):
   93         # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
   94         # at the same time without actual units specified. This lead to wrong sorting.
   95         self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id'))
   96 
   97     def _parse_config(self, config, video_id):
   98         video_data = config['video']
   99         # Extract title
  100         video_title = video_data['title']
  101 
  102         # Extract uploader, uploader_url and uploader_id
  103         video_uploader = video_data.get('owner', {}).get('name')
  104         video_uploader_url = video_data.get('owner', {}).get('url')
  105         video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None
  106 
  107         # Extract video thumbnail
  108         video_thumbnail = video_data.get('thumbnail')
  109         if video_thumbnail is None:
  110             video_thumbs = video_data.get('thumbs')
  111             if video_thumbs and isinstance(video_thumbs, dict):
  112                 _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
  113 
  114         # Extract video duration
  115         video_duration = int_or_none(video_data.get('duration'))
  116 
  117         formats = []
  118         config_files = video_data.get('files') or config['request'].get('files', {})
  119         for f in config_files.get('progressive', []):
  120             video_url = f.get('url')
  121             if not video_url:
  122                 continue
  123             formats.append({
  124                 'url': video_url,
  125                 'format_id': 'http-%s' % f.get('quality'),
  126                 'width': int_or_none(f.get('width')),
  127                 'height': int_or_none(f.get('height')),
  128                 'fps': int_or_none(f.get('fps')),
  129                 'tbr': int_or_none(f.get('bitrate')),
  130             })
  131 
  132         for files_type in ('hls', 'dash'):
  133             for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
  134                 manifest_url = cdn_data.get('url')
  135                 if not manifest_url:
  136                     continue
  137                 format_id = '%s-%s' % (files_type, cdn_name)
  138                 if files_type == 'hls':
  139                     formats.extend(self._extract_m3u8_formats(
  140                         manifest_url, video_id, 'mp4',
  141                         'm3u8_native', m3u8_id=format_id,
  142                         note='Downloading %s m3u8 information' % cdn_name,
  143                         fatal=False))
  144                 elif files_type == 'dash':
  145                     mpd_pattern = r'/%s/(?:sep/)?video/' % video_id
  146                     mpd_manifest_urls = []
  147                     if re.search(mpd_pattern, manifest_url):
  148                         for suffix, repl in (('', 'video'), ('_sep', 'sep/video')):
  149                             mpd_manifest_urls.append((format_id + suffix, re.sub(
  150                                 mpd_pattern, '/%s/%s/' % (video_id, repl), manifest_url)))
  151                     else:
  152                         mpd_manifest_urls = [(format_id, manifest_url)]
  153                     for f_id, m_url in mpd_manifest_urls:
  154                         mpd_formats = self._extract_mpd_formats(
  155                             m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
  156                             'Downloading %s MPD information' % cdn_name,
  157                             fatal=False)
  158                         for f in mpd_formats:
  159                             if f.get('vcodec') == 'none':
  160                                 f['preference'] = -50
  161                             elif f.get('acodec') == 'none':
  162                                 f['preference'] = -40
  163                         formats.extend(mpd_formats)
  164 
  165         subtitles = {}
  166         text_tracks = config['request'].get('text_tracks')
  167         if text_tracks:
  168             for tt in text_tracks:
  169                 subtitles[tt['lang']] = [{
  170                     'ext': 'vtt',
  171                     'url': 'https://vimeo.com' + tt['url'],
  172                 }]
  173 
  174         return {
  175             'title': video_title,
  176             'uploader': video_uploader,
  177             'uploader_id': video_uploader_id,
  178             'uploader_url': video_uploader_url,
  179             'thumbnail': video_thumbnail,
  180             'duration': video_duration,
  181             'formats': formats,
  182             'subtitles': subtitles,
  183         }
  184 
  185 
  186 class VimeoIE(VimeoBaseInfoExtractor):
  187     """Information extractor for vimeo.com."""
  188 
  189     # _VALID_URL matches Vimeo URLs
  190     _VALID_URL = r'''(?x)
  191                     https?://
  192                         (?:
  193                             (?:
  194                                 www|
  195                                 (?P<player>player)
  196                             )
  197                             \.
  198                         )?
  199                         vimeo(?P<pro>pro)?\.com/
  200                         (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
  201                         (?:.*?/)?
  202                         (?:
  203                             (?:
  204                                 play_redirect_hls|
  205                                 moogaloop\.swf)\?clip_id=
  206                             )?
  207                         (?:videos?/)?
  208                         (?P<id>[0-9]+)
  209                         (?:/[\da-f]+)?
  210                         /?(?:[?&].*)?(?:[#].*)?$
  211                     '''
  212     IE_NAME = 'vimeo'
  213     _TESTS = [
  214         {
  215             'url': 'http://vimeo.com/56015672#at=0',
  216             'md5': '8879b6cc097e987f02484baf890129e5',
  217             'info_dict': {
  218                 'id': '56015672',
  219                 'ext': 'mp4',
  220                 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
  221                 'description': 'md5:2d3305bad981a06ff79f027f19865021',
  222                 'timestamp': 1355990239,
  223                 'upload_date': '20121220',
  224                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434',
  225                 'uploader_id': 'user7108434',
  226                 'uploader': 'Filippo Valsorda',
  227                 'duration': 10,
  228                 'license': 'by-sa',
  229             },
  230         },
  231         {
  232             'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
  233             'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82',
  234             'note': 'Vimeo Pro video (#1197)',
  235             'info_dict': {
  236                 'id': '68093876',
  237                 'ext': 'mp4',
  238                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus',
  239                 'uploader_id': 'openstreetmapus',
  240                 'uploader': 'OpenStreetMap US',
  241                 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
  242                 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30',
  243                 'duration': 1595,
  244             },
  245         },
  246         {
  247             'url': 'http://player.vimeo.com/video/54469442',
  248             'md5': '619b811a4417aa4abe78dc653becf511',
  249             'note': 'Videos that embed the url in the player page',
  250             'info_dict': {
  251                 'id': '54469442',
  252                 'ext': 'mp4',
  253                 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
  254                 'uploader': 'The BLN & Business of Software',
  255                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
  256                 'uploader_id': 'theblnbusinessofsoftware',
  257                 'duration': 3610,
  258                 'description': None,
  259             },
  260         },
  261         {
  262             'url': 'http://vimeo.com/68375962',
  263             'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
  264             'note': 'Video protected with password',
  265             'info_dict': {
  266                 'id': '68375962',
  267                 'ext': 'mp4',
  268                 'title': 'youtube-dl password protected test video',
  269                 'timestamp': 1371200155,
  270                 'upload_date': '20130614',
  271                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
  272                 'uploader_id': 'user18948128',
  273                 'uploader': 'Jaime Marquínez Ferrándiz',
  274                 'duration': 10,
  275                 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
  276             },
  277             'params': {
  278                 'videopassword': 'youtube-dl',
  279             },
  280         },
  281         {
  282             'url': 'http://vimeo.com/channels/keypeele/75629013',
  283             'md5': '2f86a05afe9d7abc0b9126d229bbe15d',
  284             'info_dict': {
  285                 'id': '75629013',
  286                 'ext': 'mp4',
  287                 'title': 'Key & Peele: Terrorist Interrogation',
  288                 'description': 'md5:8678b246399b070816b12313e8b4eb5c',
  289                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio',
  290                 'uploader_id': 'atencio',
  291                 'uploader': 'Peter Atencio',
  292                 'timestamp': 1380339469,
  293                 'upload_date': '20130928',
  294                 'duration': 187,
  295             },
  296         },
  297         {
  298             'url': 'http://vimeo.com/76979871',
  299             'note': 'Video with subtitles',
  300             'info_dict': {
  301                 'id': '76979871',
  302                 'ext': 'mp4',
  303                 'title': 'The New Vimeo Player (You Know, For Videos)',
  304                 'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
  305                 'timestamp': 1381846109,
  306                 'upload_date': '20131015',
  307                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff',
  308                 'uploader_id': 'staff',
  309                 'uploader': 'Vimeo Staff',
  310                 'duration': 62,
  311             }
  312         },
  313         {
  314             # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/
  315             'url': 'https://player.vimeo.com/video/98044508',
  316             'note': 'The js code contains assignments to the same variable as the config',
  317             'info_dict': {
  318                 'id': '98044508',
  319                 'ext': 'mp4',
  320                 'title': 'Pier Solar OUYA Official Trailer',
  321                 'uploader': 'Tulio Gonçalves',
  322                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593',
  323                 'uploader_id': 'user28849593',
  324             },
  325         },
  326         {
  327             # contains original format
  328             'url': 'https://vimeo.com/33951933',
  329             'md5': '53c688fa95a55bf4b7293d37a89c5c53',
  330             'info_dict': {
  331                 'id': '33951933',
  332                 'ext': 'mp4',
  333                 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute',
  334                 'uploader': 'The DMCI',
  335                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci',
  336                 'uploader_id': 'dmci',
  337                 'timestamp': 1324343742,
  338                 'upload_date': '20111220',
  339                 'description': 'md5:ae23671e82d05415868f7ad1aec21147',
  340             },
  341         },
  342         {
  343             # only available via https://vimeo.com/channels/tributes/6213729 and
  344             # not via https://vimeo.com/6213729
  345             'url': 'https://vimeo.com/channels/tributes/6213729',
  346             'info_dict': {
  347                 'id': '6213729',
  348                 'ext': 'mov',
  349                 'title': 'Vimeo Tribute: The Shining',
  350                 'uploader': 'Casey Donahue',
  351                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue',
  352                 'uploader_id': 'caseydonahue',
  353                 'timestamp': 1250886430,
  354                 'upload_date': '20090821',
  355                 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
  356             },
  357             'params': {
  358                 'skip_download': True,
  359             },
  360             'expected_warnings': ['Unable to download JSON metadata'],
  361         },
  362         {
  363             # redirects to ondemand extractor and should be passed through it
  364             # for successful extraction
  365             'url': 'https://vimeo.com/73445910',
  366             'info_dict': {
  367                 'id': '73445910',
  368                 'ext': 'mp4',
  369                 'title': 'The Reluctant Revolutionary',
  370                 'uploader': '10Ft Films',
  371                 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms',
  372                 'uploader_id': 'tenfootfilms',
  373             },
  374             'params': {
  375                 'skip_download': True,
  376             },
  377         },
  378         {
  379             'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
  380             'only_matching': True,
  381         },
  382         {
  383             'url': 'https://vimeo.com/109815029',
  384             'note': 'Video not completely processed, "failed" seed status',
  385             'only_matching': True,
  386         },
  387         {
  388             'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
  389             'only_matching': True,
  390         },
  391         {
  392             'url': 'https://vimeo.com/album/2632481/video/79010983',
  393             'only_matching': True,
  394         },
  395         {
  396             # source file returns 403: Forbidden
  397             'url': 'https://vimeo.com/7809605',
  398             'only_matching': True,
  399         },
  400         {
  401             'url': 'https://vimeo.com/160743502/abd0e13fb4',
  402             'only_matching': True,
  403         }
  404     ]
  405 
  406     @staticmethod
  407     def _smuggle_referrer(url, referrer_url):
  408         return smuggle_url(url, {'http_headers': {'Referer': referrer_url}})
  409 
  410     @staticmethod
  411     def _extract_urls(url, webpage):
  412         urls = []
  413         # Look for embedded (iframe) Vimeo player
  414         for mobj in re.finditer(
  415                 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1',
  416                 webpage):
  417             urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url))
  418         PLAIN_EMBED_RE = (
  419             # Look for embedded (swf embed) Vimeo player
  420             r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1',
  421             # Look more for non-standard embedded Vimeo player
  422             r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1',
  423         )
  424         for embed_re in PLAIN_EMBED_RE:
  425             for mobj in re.finditer(embed_re, webpage):
  426                 urls.append(mobj.group('url'))
  427         return urls
  428 
  429     @staticmethod
  430     def _extract_url(url, webpage):
  431         urls = VimeoIE._extract_urls(url, webpage)
  432         return urls[0] if urls else None
  433 
  434     def _verify_player_video_password(self, url, video_id):
  435         password = self._downloader.params.get('videopassword')
  436         if password is None:
  437             raise ExtractorError('This video is protected by a password, use the --video-password option')
  438         data = urlencode_postdata({'password': password})
  439         pass_url = url + '/check-password'
  440         password_request = sanitized_Request(pass_url, data)
  441         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
  442         password_request.add_header('Referer', url)
  443         return self._download_json(
  444             password_request, video_id,
  445             'Verifying the password', 'Wrong password')
  446 
  447     def _real_initialize(self):
  448         self._login()
  449 
  450     def _real_extract(self, url):
  451         url, data = unsmuggle_url(url, {})
  452         headers = std_headers.copy()
  453         if 'http_headers' in data:
  454             headers.update(data['http_headers'])
  455         if 'Referer' not in headers:
  456             headers['Referer'] = url
  457 
  458         # Extract ID from URL
  459         mobj = re.match(self._VALID_URL, url)
  460         video_id = mobj.group('id')
  461         orig_url = url
  462         if mobj.group('pro') or mobj.group('player'):
  463             url = 'https://player.vimeo.com/video/' + video_id
  464         elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
  465             url = 'https://vimeo.com/' + video_id
  466 
  467         # Retrieve video webpage to extract further information
  468         request = sanitized_Request(url, headers=headers)
  469         try:
  470             webpage, urlh = self._download_webpage_handle(request, video_id)
  471             redirect_url = compat_str(urlh.geturl())
  472             # Some URLs redirect to ondemand can't be extracted with
  473             # this extractor right away thus should be passed through
  474             # ondemand extractor (e.g. https://vimeo.com/73445910)
  475             if VimeoOndemandIE.suitable(redirect_url):
  476                 return self.url_result(redirect_url, VimeoOndemandIE.ie_key())
  477         except ExtractorError as ee:
  478             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
  479                 errmsg = ee.cause.read()
  480                 if b'Because of its privacy settings, this video cannot be played here' in errmsg:
  481                     raise ExtractorError(
  482                         'Cannot download embed-only video without embedding '
  483                         'URL. Please call youtube-dl with the URL of the page '
  484                         'that embeds this video.',
  485                         expected=True)
  486             raise
  487 
  488         # Now we begin extracting as much information as we can from what we
  489         # retrieved. First we extract the information common to all extractors,
  490         # and latter we extract those that are Vimeo specific.
  491         self.report_extraction(video_id)
  492 
  493         vimeo_config = self._search_regex(
  494             r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', webpage,
  495             'vimeo config', default=None)
  496         if vimeo_config:
  497             seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {})
  498             if seed_status.get('state') == 'failed':
  499                 raise ExtractorError(
  500                     '%s said: %s' % (self.IE_NAME, seed_status['title']),
  501                     expected=True)
  502 
  503         cc_license = None
  504         timestamp = None
  505 
  506         # Extract the config JSON
  507         try:
  508             try:
  509                 config_url = self._html_search_regex(
  510                     r' data-config-url="(.+?)"', webpage,
  511                     'config URL', default=None)
  512                 if not config_url:
  513                     # Sometimes new react-based page is served instead of old one that require
  514                     # different config URL extraction approach (see
  515                     # https://github.com/rg3/youtube-dl/pull/7209)
  516                     vimeo_clip_page_config = self._search_regex(
  517                         r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage,
  518                         'vimeo clip page config')
  519                     page_config = self._parse_json(vimeo_clip_page_config, video_id)
  520                     config_url = page_config['player']['config_url']
  521                     cc_license = page_config.get('cc_license')
  522                     timestamp = try_get(
  523                         page_config, lambda x: x['clip']['uploaded_on'],
  524                         compat_str)
  525                 config_json = self._download_webpage(config_url, video_id)
  526                 config = json.loads(config_json)
  527             except RegexNotFoundError:
  528                 # For pro videos or player.vimeo.com urls
  529                 # We try to find out to which variable is assigned the config dic
  530                 m_variable_name = re.search(r'(\w)\.video\.id', webpage)
  531                 if m_variable_name is not None:
  532                     config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))
  533                 else:
  534                     config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
  535                 config = self._search_regex(config_re, webpage, 'info section',
  536                                             flags=re.DOTALL)
  537                 config = json.loads(config)
  538         except Exception as e:
  539             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
  540                 raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
  541 
  542             if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
  543                 if '_video_password_verified' in data:
  544                     raise ExtractorError('video password verification failed!')
  545                 self._verify_video_password(redirect_url, video_id, webpage)
  546                 return self._real_extract(
  547                     smuggle_url(redirect_url, {'_video_password_verified': 'verified'}))
  548             else:
  549                 raise ExtractorError('Unable to extract info section',
  550                                      cause=e)
  551         else:
  552             if config.get('view') == 4:
  553                 config = self._verify_player_video_password(redirect_url, video_id)
  554 
  555         def is_rented():
  556             if '>You rented this title.<' in webpage:
  557                 return True
  558             if config.get('user', {}).get('purchased'):
  559                 return True
  560             label = try_get(
  561                 config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str)
  562             if label and label.startswith('You rented this'):
  563                 return True
  564             return False
  565 
  566         if is_rented():
  567             feature_id = config.get('video', {}).get('vod', {}).get('feature_id')
  568             if feature_id and not data.get('force_feature_id', False):
  569                 return self.url_result(smuggle_url(
  570                     'https://player.vimeo.com/player/%s' % feature_id,
  571                     {'force_feature_id': True}), 'Vimeo')
  572 
  573         # Extract video description
  574 
  575         video_description = self._html_search_regex(
  576             r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
  577             webpage, 'description', default=None)
  578         if not video_description:
  579             video_description = self._html_search_meta(
  580                 'description', webpage, default=None)
  581         if not video_description and mobj.group('pro'):
  582             orig_webpage = self._download_webpage(
  583                 orig_url, video_id,
  584                 note='Downloading webpage for description',
  585                 fatal=False)
  586             if orig_webpage:
  587                 video_description = self._html_search_meta(
  588                     'description', orig_webpage, default=None)
  589         if not video_description and not mobj.group('player'):
  590             self._downloader.report_warning('Cannot find video description')
  591 
  592         # Extract upload date
  593         if not timestamp:
  594             timestamp = self._search_regex(
  595                 r'<time[^>]+datetime="([^"]+)"', webpage,
  596                 'timestamp', default=None)
  597 
  598         try:
  599             view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
  600             like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
  601             comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
  602         except RegexNotFoundError:
  603             # This info is only available in vimeo.com/{id} urls
  604             view_count = None
  605             like_count = None
  606             comment_count = None
  607 
  608         formats = []
  609         download_request = sanitized_Request('https://vimeo.com/%s?action=load_download_config' % video_id, headers={
  610             'X-Requested-With': 'XMLHttpRequest'})
  611         download_data = self._download_json(download_request, video_id, fatal=False)
  612         if download_data:
  613             source_file = download_data.get('source_file')
  614             if isinstance(source_file, dict):
  615                 download_url = source_file.get('download_url')
  616                 if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
  617                     source_name = source_file.get('public_name', 'Original')
  618                     if self._is_valid_url(download_url, video_id, '%s video' % source_name):
  619                         ext = (try_get(
  620                             source_file, lambda x: x['extension'],
  621                             compat_str) or determine_ext(
  622                             download_url, None) or 'mp4').lower()
  623                         formats.append({
  624                             'url': download_url,
  625                             'ext': ext,
  626                             'width': int_or_none(source_file.get('width')),
  627                             'height': int_or_none(source_file.get('height')),
  628                             'filesize': parse_filesize(source_file.get('size')),
  629                             'format_id': source_name,
  630                             'preference': 1,
  631                         })
  632 
  633         info_dict = self._parse_config(config, video_id)
  634         formats.extend(info_dict['formats'])
  635         self._vimeo_sort_formats(formats)
  636 
  637         if not cc_license:
  638             cc_license = self._search_regex(
  639                 r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
  640                 webpage, 'license', default=None, group='license')
  641 
  642         info_dict.update({
  643             'id': video_id,
  644             'formats': formats,
  645             'timestamp': unified_timestamp(timestamp),
  646             'description': video_description,
  647             'webpage_url': url,
  648             'view_count': view_count,
  649             'like_count': like_count,
  650             'comment_count': comment_count,
  651             'license': cc_license,
  652         })
  653 
  654         return info_dict
  655 
  656 
  657 class VimeoOndemandIE(VimeoBaseInfoExtractor):
  658     IE_NAME = 'vimeo:ondemand'
  659     _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P<id>[^/?#&]+)'
  660     _TESTS = [{
  661         # ondemand video not available via https://vimeo.com/id
  662         'url': 'https://vimeo.com/ondemand/20704',
  663         'md5': 'c424deda8c7f73c1dfb3edd7630e2f35',
  664         'info_dict': {
  665             'id': '105442900',
  666             'ext': 'mp4',
  667             'title': 'המעבדה - במאי יותם פלדמן',
  668             'uploader': 'גם סרטים',
  669             'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms',
  670             'uploader_id': 'gumfilms',
  671         },
  672         'params': {
  673             'format': 'best[protocol=https]',
  674         },
  675     }, {
  676         # requires Referer to be passed along with og:video:url
  677         'url': 'https://vimeo.com/ondemand/36938/126682985',
  678         'info_dict': {
  679             'id': '126682985',
  680             'ext': 'mp4',
  681             'title': 'Rävlock, rätt läte på rätt plats',
  682             'uploader': 'Lindroth & Norin',
  683             'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user14430847',
  684             'uploader_id': 'user14430847',
  685         },
  686         'params': {
  687             'skip_download': True,
  688         },
  689     }, {
  690         'url': 'https://vimeo.com/ondemand/nazmaalik',
  691         'only_matching': True,
  692     }, {
  693         'url': 'https://vimeo.com/ondemand/141692381',
  694         'only_matching': True,
  695     }, {
  696         'url': 'https://vimeo.com/ondemand/thelastcolony/150274832',
  697         'only_matching': True,
  698     }]
  699 
  700     def _real_extract(self, url):
  701         video_id = self._match_id(url)
  702         webpage = self._download_webpage(url, video_id)
  703         return self.url_result(
  704             # Some videos require Referer to be passed along with og:video:url
  705             # similarly to generic vimeo embeds (e.g.
  706             # https://vimeo.com/ondemand/36938/126682985).
  707             VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url),
  708             VimeoIE.ie_key())
  709 
  710 
  711 class VimeoChannelIE(VimeoBaseInfoExtractor):
  712     IE_NAME = 'vimeo:channel'
  713     _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
  714     _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
  715     _TITLE = None
  716     _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
  717     _TESTS = [{
  718         'url': 'https://vimeo.com/channels/tributes',
  719         'info_dict': {
  720             'id': 'tributes',
  721             'title': 'Vimeo Tributes',
  722         },
  723         'playlist_mincount': 25,
  724     }]
  725 
  726     def _page_url(self, base_url, pagenum):
  727         return '%s/videos/page:%d/' % (base_url, pagenum)
  728 
  729     def _extract_list_title(self, webpage):
  730         return self._TITLE or self._html_search_regex(self._TITLE_RE, webpage, 'list title')
  731 
  732     def _login_list_password(self, page_url, list_id, webpage):
  733         login_form = self._search_regex(
  734             r'(?s)<form[^>]+?id="pw_form"(.*?)</form>',
  735             webpage, 'login form', default=None)
  736         if not login_form:
  737             return webpage
  738 
  739         password = self._downloader.params.get('videopassword')
  740         if password is None:
  741             raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
  742         fields = self._hidden_inputs(login_form)
  743         token, vuid = self._extract_xsrft_and_vuid(webpage)
  744         fields['token'] = token
  745         fields['password'] = password
  746         post = urlencode_postdata(fields)
  747         password_path = self._search_regex(
  748             r'action="([^"]+)"', login_form, 'password URL')
  749         password_url = compat_urlparse.urljoin(page_url, password_path)
  750         password_request = sanitized_Request(password_url, post)
  751         password_request.add_header('Content-type', 'application/x-www-form-urlencoded')
  752         self._set_vimeo_cookie('vuid', vuid)
  753         self._set_vimeo_cookie('xsrft', token)
  754 
  755         return self._download_webpage(
  756             password_request, list_id,
  757             'Verifying the password', 'Wrong password')
  758 
  759     def _title_and_entries(self, list_id, base_url):
  760         for pagenum in itertools.count(1):
  761             page_url = self._page_url(base_url, pagenum)
  762             webpage = self._download_webpage(
  763                 page_url, list_id,
  764                 'Downloading page %s' % pagenum)
  765 
  766             if pagenum == 1:
  767                 webpage = self._login_list_password(page_url, list_id, webpage)
  768                 yield self._extract_list_title(webpage)
  769 
  770             # Try extracting href first since not all videos are available via
  771             # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729)
  772             clips = re.findall(
  773                 r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)(?:[^>]+\btitle="([^"]+)")?', webpage)
  774             if clips:
  775                 for video_id, video_url, video_title in clips:
  776                     yield self.url_result(
  777                         compat_urlparse.urljoin(base_url, video_url),
  778                         VimeoIE.ie_key(), video_id=video_id, video_title=video_title)
  779             # More relaxed fallback
  780             else:
  781                 for video_id in re.findall(r'id=["\']clip_(\d+)', webpage):
  782                     yield self.url_result(
  783                         'https://vimeo.com/%s' % video_id,
  784                         VimeoIE.ie_key(), video_id=video_id)
  785 
  786             if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
  787                 break
  788 
  789     def _extract_videos(self, list_id, base_url):
  790         title_and_entries = self._title_and_entries(list_id, base_url)
  791         list_title = next(title_and_entries)
  792         return self.playlist_result(title_and_entries, list_id, list_title)
  793 
  794     def _real_extract(self, url):
  795         mobj = re.match(self._VALID_URL, url)
  796         channel_id = mobj.group('id')
  797         return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id)
  798 
  799 
  800 class VimeoUserIE(VimeoChannelIE):
  801     IE_NAME = 'vimeo:user'
  802     _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
  803     _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
  804     _TESTS = [{
  805         'url': 'https://vimeo.com/nkistudio/videos',
  806         'info_dict': {
  807             'title': 'Nki',
  808             'id': 'nkistudio',
  809         },
  810         'playlist_mincount': 66,
  811     }]
  812 
  813     def _real_extract(self, url):
  814         mobj = re.match(self._VALID_URL, url)
  815         name = mobj.group('name')
  816         return self._extract_videos(name, 'https://vimeo.com/%s' % name)
  817 
  818 
  819 class VimeoAlbumIE(VimeoChannelIE):
  820     IE_NAME = 'vimeo:album'
  821     _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))'
  822     _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
  823     _TESTS = [{
  824         'url': 'https://vimeo.com/album/2632481',
  825         'info_dict': {
  826             'id': '2632481',
  827             'title': 'Staff Favorites: November 2013',
  828         },
  829         'playlist_mincount': 13,
  830     }, {
  831         'note': 'Password-protected album',
  832         'url': 'https://vimeo.com/album/3253534',
  833         'info_dict': {
  834             'title': 'test',
  835             'id': '3253534',
  836         },
  837         'playlist_count': 1,
  838         'params': {
  839             'videopassword': 'youtube-dl',
  840         }
  841     }, {
  842         'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail',
  843         'only_matching': True,
  844     }, {
  845         # TODO: respect page number
  846         'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail',
  847         'only_matching': True,
  848     }]
  849 
  850     def _page_url(self, base_url, pagenum):
  851         return '%s/page:%d/' % (base_url, pagenum)
  852 
  853     def _real_extract(self, url):
  854         album_id = self._match_id(url)
  855         return self._extract_videos(album_id, 'https://vimeo.com/album/%s' % album_id)
  856 
  857 
  858 class VimeoGroupsIE(VimeoAlbumIE):
  859     IE_NAME = 'vimeo:group'
  860     _VALID_URL = r'https://vimeo\.com/groups/(?P<name>[^/]+)(?:/(?!videos?/\d+)|$)'
  861     _TESTS = [{
  862         'url': 'https://vimeo.com/groups/rolexawards',
  863         'info_dict': {
  864             'id': 'rolexawards',
  865             'title': 'Rolex Awards for Enterprise',
  866         },
  867         'playlist_mincount': 73,
  868     }]
  869 
  870     def _extract_list_title(self, webpage):
  871         return self._og_search_title(webpage)
  872 
  873     def _real_extract(self, url):
  874         mobj = re.match(self._VALID_URL, url)
  875         name = mobj.group('name')
  876         return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name)
  877 
  878 
  879 class VimeoReviewIE(VimeoBaseInfoExtractor):
  880     IE_NAME = 'vimeo:review'
  881     IE_DESC = 'Review pages on vimeo'
  882     _VALID_URL = r'https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
  883     _TESTS = [{
  884         'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
  885         'md5': 'c507a72f780cacc12b2248bb4006d253',
  886         'info_dict': {
  887             'id': '75524534',
  888             'ext': 'mp4',
  889             'title': "DICK HARDWICK 'Comedian'",
  890             'uploader': 'Richard Hardwick',
  891             'uploader_id': 'user21297594',
  892         }
  893     }, {
  894         'note': 'video player needs Referer',
  895         'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',
  896         'md5': '6295fdab8f4bf6a002d058b2c6dce276',
  897         'info_dict': {
  898             'id': '91613211',
  899             'ext': 'mp4',
  900             'title': 're:(?i)^Death by dogma versus assembling agile . Sander Hoogendoorn',
  901             'uploader': 'DevWeek Events',
  902             'duration': 2773,
  903             'thumbnail': r're:^https?://.*\.jpg$',
  904             'uploader_id': 'user22258446',
  905         }
  906     }, {
  907         'note': 'Password protected',
  908         'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
  909         'info_dict': {
  910             'id': '138823582',
  911             'ext': 'mp4',
  912             'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1',
  913             'uploader': 'TMB',
  914             'uploader_id': 'user37284429',
  915         },
  916         'params': {
  917             'videopassword': 'holygrail',
  918         },
  919         'skip': 'video gone',
  920     }]
  921 
  922     def _real_initialize(self):
  923         self._login()
  924 
  925     def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
  926         webpage = self._download_webpage(webpage_url, video_id)
  927         config_url = self._html_search_regex(
  928             r'data-config-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
  929             'config URL', default=None, group='url')
  930         if not config_url:
  931             data = self._parse_json(self._search_regex(
  932                 r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data',
  933                 default=NO_DEFAULT if video_password_verified else '{}'), video_id)
  934             config_url = data.get('vimeo_esi', {}).get('config', {}).get('configUrl')
  935         if config_url is None:
  936             self._verify_video_password(webpage_url, video_id, webpage)
  937             config_url = self._get_config_url(
  938                 webpage_url, video_id, video_password_verified=True)
  939         return config_url
  940 
  941     def _real_extract(self, url):
  942         video_id = self._match_id(url)
  943         config_url = self._get_config_url(url, video_id)
  944         config = self._download_json(config_url, video_id)
  945         info_dict = self._parse_config(config, video_id)
  946         self._vimeo_sort_formats(info_dict['formats'])
  947         info_dict['id'] = video_id
  948         return info_dict
  949 
  950 
  951 class VimeoWatchLaterIE(VimeoChannelIE):
  952     IE_NAME = 'vimeo:watchlater'
  953     IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)'
  954     _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater'
  955     _TITLE = 'Watch Later'
  956     _LOGIN_REQUIRED = True
  957     _TESTS = [{
  958         'url': 'https://vimeo.com/watchlater',
  959         'only_matching': True,
  960     }]
  961 
  962     def _real_initialize(self):
  963         self._login()
  964 
  965     def _page_url(self, base_url, pagenum):
  966         url = '%s/page:%d/' % (base_url, pagenum)
  967         request = sanitized_Request(url)
  968         # Set the header to get a partial html page with the ids,
  969         # the normal page doesn't contain them.
  970         request.add_header('X-Requested-With', 'XMLHttpRequest')
  971         return request
  972 
  973     def _real_extract(self, url):
  974         return self._extract_videos('watchlater', 'https://vimeo.com/watchlater')
  975 
  976 
  977 class VimeoLikesIE(InfoExtractor):
  978     _VALID_URL = r'https://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
  979     IE_NAME = 'vimeo:likes'
  980     IE_DESC = 'Vimeo user likes'
  981     _TEST = {
  982         'url': 'https://vimeo.com/user755559/likes/',
  983         'playlist_mincount': 293,
  984         'info_dict': {
  985             'id': 'user755559_likes',
  986             'description': 'See all the videos urza likes',
  987             'title': 'Videos urza likes',
  988         },
  989     }
  990 
  991     def _real_extract(self, url):
  992         user_id = self._match_id(url)
  993         webpage = self._download_webpage(url, user_id)
  994         page_count = self._int(
  995             self._search_regex(
  996                 r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
  997                     .*?</a></li>\s*<li\s+class="pagination_next">
  998                 ''', webpage, 'page count'),
  999             'page count', fatal=True)
 1000         PAGE_SIZE = 12
 1001         title = self._html_search_regex(
 1002             r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
 1003         description = self._html_search_meta('description', webpage)
 1004 
 1005         def _get_page(idx):
 1006             page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % (
 1007                 user_id, idx + 1)
 1008             webpage = self._download_webpage(
 1009                 page_url, user_id,
 1010                 note='Downloading page %d/%d' % (idx + 1, page_count))
 1011             video_list = self._search_regex(
 1012                 r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
 1013                 webpage, 'video content')
 1014             paths = re.findall(
 1015                 r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
 1016             for path in paths:
 1017                 yield {
 1018                     '_type': 'url',
 1019                     'url': compat_urlparse.urljoin(page_url, path),
 1020                 }
 1021 
 1022         pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
 1023 
 1024         return {
 1025             '_type': 'playlist',
 1026             'id': 'user%s_likes' % user_id,
 1027             'title': title,
 1028             'description': description,
 1029             'entries': pl,
 1030         }

Generated by cgit