youtube_dl/extractor/radiocanada.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import re
    5 
    6 from .common import InfoExtractor
    7 from ..compat import compat_HTTPError
    8 from ..utils import (
    9     determine_ext,
   10     ExtractorError,
   11     int_or_none,
   12     unified_strdate,
   13 )
   14 
   15 
   16 class RadioCanadaIE(InfoExtractor):
   17     IE_NAME = 'radiocanada'
   18     _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
   19     _TESTS = [
   20         {
   21             'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
   22             'info_dict': {
   23                 'id': '7184272',
   24                 'ext': 'mp4',
   25                 'title': 'Le parcours du tireur capté sur vidéo',
   26                 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
   27                 'upload_date': '20141023',
   28             },
   29             'params': {
   30                 # m3u8 download
   31                 'skip_download': True,
   32             }
   33         },
   34         {
   35             # empty Title
   36             'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/',
   37             'info_dict': {
   38                 'id': '7754998',
   39                 'ext': 'mp4',
   40                 'title': 'letelejournal22h',
   41                 'description': 'INTEGRALE WEB 22H-TJ',
   42                 'upload_date': '20170720',
   43             },
   44             'params': {
   45                 # m3u8 download
   46                 'skip_download': True,
   47             },
   48         },
   49         {
   50             # with protectionType but not actually DRM protected
   51             'url': 'radiocanada:toutv:140872',
   52             'info_dict': {
   53                 'id': '140872',
   54                 'title': 'Épisode 1',
   55                 'series': 'District 31',
   56             },
   57             'only_matching': True,
   58         }
   59     ]
   60     _GEO_COUNTRIES = ['CA']
   61     _access_token = None
   62     _claims = None
   63 
   64     def _call_api(self, path, video_id=None, app_code=None, query=None):
   65         if not query:
   66             query = {}
   67         query.update({
   68             'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb',
   69             'output': 'json',
   70         })
   71         if video_id:
   72             query.update({
   73                 'appCode': app_code,
   74                 'idMedia': video_id,
   75             })
   76         if self._access_token:
   77             query['access_token'] = self._access_token
   78         try:
   79             return self._download_json(
   80                 'https://services.radio-canada.ca/media/' + path, video_id, query=query)
   81         except ExtractorError as e:
   82             if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
   83                 data = self._parse_json(e.cause.read().decode(), None)
   84                 error = data.get('error_description') or data['errorMessage']['text']
   85                 raise ExtractorError(error, expected=True)
   86             raise
   87 
   88     def _extract_info(self, app_code, video_id):
   89         metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas']
   90 
   91         def get_meta(name):
   92             for meta in metas:
   93                 if meta.get('name') == name:
   94                     text = meta.get('text')
   95                     if text:
   96                         return text
   97 
   98         # protectionType does not necessarily mean the video is DRM protected (see
   99         # https://github.com/ytdl-org/youtube-dl/pull/18609).
  100         if get_meta('protectionType'):
  101             self.report_warning('This video is probably DRM protected.')
  102 
  103         query = {
  104             'connectionType': 'hd',
  105             'deviceType': 'ipad',
  106             'multibitrate': 'true',
  107         }
  108         if self._claims:
  109             query['claims'] = self._claims
  110         v_data = self._call_api('validation/v2/', video_id, app_code, query)
  111         v_url = v_data.get('url')
  112         if not v_url:
  113             error = v_data['message']
  114             if error == "Le contenu sélectionné n'est pas disponible dans votre pays":
  115                 raise self.raise_geo_restricted(error, self._GEO_COUNTRIES)
  116             if error == 'Le contenu sélectionné est disponible seulement en premium':
  117                 self.raise_login_required(error)
  118             raise ExtractorError(
  119                 '%s said: %s' % (self.IE_NAME, error), expected=True)
  120         formats = self._extract_m3u8_formats(v_url, video_id, 'mp4')
  121         self._sort_formats(formats)
  122 
  123         subtitles = {}
  124         closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5')
  125         if closed_caption_url:
  126             subtitles['fr'] = [{
  127                 'url': closed_caption_url,
  128                 'ext': determine_ext(closed_caption_url, 'vtt'),
  129             }]
  130 
  131         return {
  132             'id': video_id,
  133             'title': get_meta('Title') or get_meta('AV-nomEmission'),
  134             'description': get_meta('Description') or get_meta('ShortDescription'),
  135             'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
  136             'duration': int_or_none(get_meta('length')),
  137             'series': get_meta('Emission'),
  138             'season_number': int_or_none('SrcSaison'),
  139             'episode_number': int_or_none('SrcEpisode'),
  140             'upload_date': unified_strdate(get_meta('Date')),
  141             'subtitles': subtitles,
  142             'formats': formats,
  143         }
  144 
  145     def _real_extract(self, url):
  146         return self._extract_info(*re.match(self._VALID_URL, url).groups())
  147 
  148 
  149 class RadioCanadaAudioVideoIE(InfoExtractor):
  150     IE_NAME = 'radiocanada:audiovideo'
  151     _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)'
  152     _TESTS = [{
  153         'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
  154         'info_dict': {
  155             'id': '7527184',
  156             'ext': 'mp4',
  157             'title': 'Barack Obama au Vietnam',
  158             'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
  159             'upload_date': '20160523',
  160         },
  161         'params': {
  162             # m3u8 download
  163             'skip_download': True,
  164         },
  165     }, {
  166         'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
  167         'only_matching': True,
  168     }]
  169 
  170     def _real_extract(self, url):
  171         return self.url_result('radiocanada:medianet:%s' % self._match_id(url))