youtube_dl/extractor/francetv.py



    1 # coding: utf-8
    2 
    3 from __future__ import unicode_literals
    4 
    5 import re
    6 
    7 from .common import InfoExtractor
    8 from ..compat import (
    9     compat_str,
   10     compat_urlparse,
   11 )
   12 from ..utils import (
   13     clean_html,
   14     determine_ext,
   15     ExtractorError,
   16     int_or_none,
   17     parse_duration,
   18     try_get,
   19     url_or_none,
   20     urljoin,
   21 )
   22 from .dailymotion import DailymotionIE
   23 
   24 
   25 class FranceTVBaseInfoExtractor(InfoExtractor):
   26     def _make_url_result(self, video_or_full_id, catalog=None):
   27         full_id = 'francetv:%s' % video_or_full_id
   28         if '@' not in video_or_full_id and catalog:
   29             full_id += '@%s' % catalog
   30         return self.url_result(
   31             full_id, ie=FranceTVIE.ie_key(),
   32             video_id=video_or_full_id.split('@')[0])
   33 
   34 
   35 class FranceTVIE(InfoExtractor):
   36     _VALID_URL = r'''(?x)
   37                     (?:
   38                         https?://
   39                             sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\?
   40                             .*?\bidDiffusion=[^&]+|
   41                         (?:
   42                             https?://videos\.francetv\.fr/video/|
   43                             francetv:
   44                         )
   45                         (?P<id>[^@]+)(?:@(?P<catalog>.+))?
   46                     )
   47                     '''
   48 
   49     _TESTS = [{
   50         # without catalog
   51         'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0',
   52         'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f',
   53         'info_dict': {
   54             'id': '162311093',
   55             'ext': 'mp4',
   56             'title': '13h15, le dimanche... - Les mystères de Jésus',
   57             'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
   58             'timestamp': 1502623500,
   59             'upload_date': '20170813',
   60         },
   61     }, {
   62         # with catalog
   63         'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4',
   64         'only_matching': True,
   65     }, {
   66         'url': 'http://videos.francetv.fr/video/NI_657393@Regions',
   67         'only_matching': True,
   68     }, {
   69         'url': 'francetv:162311093',
   70         'only_matching': True,
   71     }, {
   72         'url': 'francetv:NI_1004933@Zouzous',
   73         'only_matching': True,
   74     }, {
   75         'url': 'francetv:NI_983319@Info-web',
   76         'only_matching': True,
   77     }, {
   78         'url': 'francetv:NI_983319',
   79         'only_matching': True,
   80     }, {
   81         'url': 'francetv:NI_657393@Regions',
   82         'only_matching': True,
   83     }, {
   84         # france-3 live
   85         'url': 'francetv:SIM_France3',
   86         'only_matching': True,
   87     }]
   88 
   89     def _extract_video(self, video_id, catalogue=None):
   90         # Videos are identified by idDiffusion so catalogue part is optional.
   91         # However when provided, some extra formats may be returned so we pass
   92         # it if available.
   93         info = self._download_json(
   94             'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
   95             video_id, 'Downloading video JSON', query={
   96                 'idDiffusion': video_id,
   97                 'catalogue': catalogue or '',
   98             })
   99 
  100         if info.get('status') == 'NOK':
  101             raise ExtractorError(
  102                 '%s returned error: %s' % (self.IE_NAME, info['message']),
  103                 expected=True)
  104         allowed_countries = info['videos'][0].get('geoblocage')
  105         if allowed_countries:
  106             georestricted = True
  107             geo_info = self._download_json(
  108                 'http://geo.francetv.fr/ws/edgescape.json', video_id,
  109                 'Downloading geo restriction info')
  110             country = geo_info['reponse']['geo_info']['country_code']
  111             if country not in allowed_countries:
  112                 raise ExtractorError(
  113                     'The video is not available from your location',
  114                     expected=True)
  115         else:
  116             georestricted = False
  117 
  118         def sign(manifest_url, manifest_id):
  119             for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
  120                 signed_url = url_or_none(self._download_webpage(
  121                     'https://%s/esi/TA' % host, video_id,
  122                     'Downloading signed %s manifest URL' % manifest_id,
  123                     fatal=False, query={
  124                         'url': manifest_url,
  125                     }))
  126                 if signed_url:
  127                     return signed_url
  128             return manifest_url
  129 
  130         is_live = None
  131 
  132         videos = []
  133 
  134         for video in (info.get('videos') or []):
  135             if video.get('statut') != 'ONLINE':
  136                 continue
  137             if not video.get('url'):
  138                 continue
  139             videos.append(video)
  140 
  141         if not videos:
  142             for device_type in ['desktop', 'mobile']:
  143                 fallback_info = self._download_json(
  144                     'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
  145                     video_id, 'Downloading fallback %s video JSON' % device_type, query={
  146                         'device_type': device_type,
  147                         'browser': 'chrome',
  148                     }, fatal=False)
  149 
  150                 if fallback_info and fallback_info.get('video'):
  151                     videos.append(fallback_info['video'])
  152 
  153         formats = []
  154         for video in videos:
  155             video_url = video.get('url')
  156             if not video_url:
  157                 continue
  158             if is_live is None:
  159                 is_live = (try_get(
  160                     video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
  161                     or video.get('is_live') is True
  162                     or '/live.francetv.fr/' in video_url)
  163             format_id = video.get('format')
  164             ext = determine_ext(video_url)
  165             if ext == 'f4m':
  166                 if georestricted:
  167                     # See https://github.com/ytdl-org/youtube-dl/issues/3963
  168                     # m3u8 urls work fine
  169                     continue
  170                 formats.extend(self._extract_f4m_formats(
  171                     sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
  172                     video_id, f4m_id=format_id, fatal=False))
  173             elif ext == 'm3u8':
  174                 formats.extend(self._extract_m3u8_formats(
  175                     sign(video_url, format_id), video_id, 'mp4',
  176                     entry_protocol='m3u8_native', m3u8_id=format_id,
  177                     fatal=False))
  178             elif ext == 'mpd':
  179                 formats.extend(self._extract_mpd_formats(
  180                     sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
  181             elif video_url.startswith('rtmp'):
  182                 formats.append({
  183                     'url': video_url,
  184                     'format_id': 'rtmp-%s' % format_id,
  185                     'ext': 'flv',
  186                 })
  187             else:
  188                 if self._is_valid_url(video_url, video_id, format_id):
  189                     formats.append({
  190                         'url': video_url,
  191                         'format_id': format_id,
  192                     })
  193 
  194         self._sort_formats(formats)
  195 
  196         title = info['titre']
  197         subtitle = info.get('sous_titre')
  198         if subtitle:
  199             title += ' - %s' % subtitle
  200         title = title.strip()
  201 
  202         subtitles = {}
  203         subtitles_list = [{
  204             'url': subformat['url'],
  205             'ext': subformat.get('format'),
  206         } for subformat in info.get('subtitles', []) if subformat.get('url')]
  207         if subtitles_list:
  208             subtitles['fr'] = subtitles_list
  209 
  210         return {
  211             'id': video_id,
  212             'title': self._live_title(title) if is_live else title,
  213             'description': clean_html(info.get('synopsis')),
  214             'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')),
  215             'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
  216             'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
  217             'is_live': is_live,
  218             'formats': formats,
  219             'subtitles': subtitles,
  220         }
  221 
  222     def _real_extract(self, url):
  223         mobj = re.match(self._VALID_URL, url)
  224         video_id = mobj.group('id')
  225         catalog = mobj.group('catalog')
  226 
  227         if not video_id:
  228             qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
  229             video_id = qs.get('idDiffusion', [None])[0]
  230             catalog = qs.get('catalogue', [None])[0]
  231             if not video_id:
  232                 raise ExtractorError('Invalid URL', expected=True)
  233 
  234         return self._extract_video(video_id, catalog)
  235 
  236 
  237 class FranceTVSiteIE(FranceTVBaseInfoExtractor):
  238     _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html'
  239 
  240     _TESTS = [{
  241         'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
  242         'info_dict': {
  243             'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
  244             'ext': 'mp4',
  245             'title': '13h15, le dimanche... - Les mystères de Jésus',
  246             'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
  247             'timestamp': 1502623500,
  248             'upload_date': '20170813',
  249         },
  250         'params': {
  251             'skip_download': True,
  252         },
  253         'add_ie': [FranceTVIE.ie_key()],
  254     }, {
  255         # france3
  256         'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
  257         'only_matching': True,
  258     }, {
  259         # france4
  260         'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html',
  261         'only_matching': True,
  262     }, {
  263         # france5
  264         'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html',
  265         'only_matching': True,
  266     }, {
  267         # franceo
  268         'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html',
  269         'only_matching': True,
  270     }, {
  271         # france2 live
  272         'url': 'https://www.france.tv/france-2/direct.html',
  273         'only_matching': True,
  274     }, {
  275         'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html',
  276         'only_matching': True,
  277     }, {
  278         'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html',
  279         'only_matching': True,
  280     }, {
  281         'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html',
  282         'only_matching': True,
  283     }, {
  284         'url': 'https://www.france.tv/142749-rouge-sang.html',
  285         'only_matching': True,
  286     }, {
  287         # france-3 live
  288         'url': 'https://www.france.tv/france-3/direct.html',
  289         'only_matching': True,
  290     }]
  291 
  292     def _real_extract(self, url):
  293         display_id = self._match_id(url)
  294 
  295         webpage = self._download_webpage(url, display_id)
  296 
  297         catalogue = None
  298         video_id = self._search_regex(
  299             r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
  300             webpage, 'video id', default=None, group='id')
  301 
  302         if not video_id:
  303             video_id, catalogue = self._html_search_regex(
  304                 r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
  305                 webpage, 'video ID').split('@')
  306 
  307         return self._make_url_result(video_id, catalogue)
  308 
  309 
  310 class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
  311     _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
  312 
  313     _TESTS = [{
  314         'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
  315         'info_dict': {
  316             'id': 'NI_983319',
  317             'ext': 'mp4',
  318             'title': 'Le Pen Reims',
  319             'upload_date': '20170505',
  320             'timestamp': 1493981780,
  321             'duration': 16,
  322         },
  323         'params': {
  324             'skip_download': True,
  325         },
  326         'add_ie': [FranceTVIE.ie_key()],
  327     }]
  328 
  329     def _real_extract(self, url):
  330         video_id = self._match_id(url)
  331 
  332         video = self._download_json(
  333             'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
  334             video_id)
  335 
  336         return self._make_url_result(video['video_id'], video.get('catalog'))
  337 
  338 
  339 class FranceTVInfoIE(FranceTVBaseInfoExtractor):
  340     IE_NAME = 'francetvinfo.fr'
  341     _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)'
  342 
  343     _TESTS = [{
  344         'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
  345         'info_dict': {
  346             'id': '84981923',
  347             'ext': 'mp4',
  348             'title': 'Soir 3',
  349             'upload_date': '20130826',
  350             'timestamp': 1377548400,
  351             'subtitles': {
  352                 'fr': 'mincount:2',
  353             },
  354         },
  355         'params': {
  356             'skip_download': True,
  357         },
  358         'add_ie': [FranceTVIE.ie_key()],
  359     }, {
  360         'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
  361         'only_matching': True,
  362     }, {
  363         'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
  364         'only_matching': True,
  365     }, {
  366         'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html',
  367         'only_matching': True,
  368     }, {
  369         # Dailymotion embed
  370         'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html',
  371         'md5': 'ee7f1828f25a648addc90cb2687b1f12',
  372         'info_dict': {
  373             'id': 'x4iiko0',
  374             'ext': 'mp4',
  375             'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
  376             'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
  377             'timestamp': 1467011958,
  378             'upload_date': '20160627',
  379             'uploader': 'France Inter',
  380             'uploader_id': 'x2q2ez',
  381         },
  382         'add_ie': ['Dailymotion'],
  383     }, {
  384         'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
  385         'only_matching': True,
  386     }, {
  387         # "<figure id=" pattern (#28792)
  388         'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html',
  389         'only_matching': True,
  390     }]
  391 
  392     def _real_extract(self, url):
  393         display_id = self._match_id(url)
  394 
  395         webpage = self._download_webpage(url, display_id)
  396 
  397         dailymotion_urls = DailymotionIE._extract_urls(webpage)
  398         if dailymotion_urls:
  399             return self.playlist_result([
  400                 self.url_result(dailymotion_url, DailymotionIE.ie_key())
  401                 for dailymotion_url in dailymotion_urls])
  402 
  403         video_id = self._search_regex(
  404             (r'player\.load[^;]+src:\s*["\']([^"\']+)',
  405              r'id-video=([^@]+@[^"]+)',
  406              r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"',
  407              r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'),
  408             webpage, 'video id')
  409 
  410         return self._make_url_result(video_id)
  411 
  412 
  413 class FranceTVInfoSportIE(FranceTVBaseInfoExtractor):
  414     IE_NAME = 'sport.francetvinfo.fr'
  415     _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  416     _TESTS = [{
  417         'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018',
  418         'info_dict': {
  419             'id': '6e49080e-3f45-11e8-b459-000d3a2439ea',
  420             'ext': 'mp4',
  421             'title': 'Retour sur les meilleurs moments de Pyeongchang 2018',
  422             'timestamp': 1523639962,
  423             'upload_date': '20180413',
  424         },
  425         'params': {
  426             'skip_download': True,
  427         },
  428         'add_ie': [FranceTVIE.ie_key()],
  429     }]
  430 
  431     def _real_extract(self, url):
  432         display_id = self._match_id(url)
  433         webpage = self._download_webpage(url, display_id)
  434         video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id')
  435         return self._make_url_result(video_id, 'Sport-web')
  436 
  437 
  438 class GenerationWhatIE(InfoExtractor):
  439     IE_NAME = 'france2.fr:generation-what'
  440     _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)'
  441 
  442     _TESTS = [{
  443         'url': 'http://generation-what.francetv.fr/portrait/video/present-arms',
  444         'info_dict': {
  445             'id': 'wtvKYUG45iw',
  446             'ext': 'mp4',
  447             'title': 'Generation What - Garde à vous - FRA',
  448             'uploader': 'Generation What',
  449             'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w',
  450             'upload_date': '20160411',
  451         },
  452         'params': {
  453             'skip_download': True,
  454         },
  455         'add_ie': ['Youtube'],
  456     }, {
  457         'url': 'http://generation-what.francetv.fr/europe/video/present-arms',
  458         'only_matching': True,
  459     }]
  460 
  461     def _real_extract(self, url):
  462         display_id = self._match_id(url)
  463 
  464         webpage = self._download_webpage(url, display_id)
  465 
  466         youtube_id = self._search_regex(
  467             r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';",
  468             webpage, 'youtube id')
  469 
  470         return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id)
  471 
  472 
  473 class CultureboxIE(FranceTVBaseInfoExtractor):
  474     _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  475 
  476     _TESTS = [{
  477         'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689',
  478         'info_dict': {
  479             'id': 'EV_134885',
  480             'ext': 'mp4',
  481             'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7',
  482             'description': 'md5:19c44af004b88219f4daa50fa9a351d4',
  483             'upload_date': '20180206',
  484             'timestamp': 1517945220,
  485             'duration': 5981,
  486         },
  487         'params': {
  488             'skip_download': True,
  489         },
  490         'add_ie': [FranceTVIE.ie_key()],
  491     }]
  492 
  493     def _real_extract(self, url):
  494         display_id = self._match_id(url)
  495 
  496         webpage = self._download_webpage(url, display_id)
  497 
  498         if ">Ce live n'est plus disponible en replay<" in webpage:
  499             raise ExtractorError(
  500                 'Video %s is not available' % display_id, expected=True)
  501 
  502         video_id, catalogue = self._search_regex(
  503             r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]',
  504             webpage, 'video id').split('@')
  505 
  506         return self._make_url_result(video_id, catalogue)
  507 
  508 
  509 class FranceTVJeunesseIE(FranceTVBaseInfoExtractor):
  510     _VALID_URL = r'(?P<url>https?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P<id>[^/?#&]+))'
  511 
  512     _TESTS = [{
  513         'url': 'https://www.zouzous.fr/heros/simon',
  514         'info_dict': {
  515             'id': 'simon',
  516         },
  517         'playlist_count': 9,
  518     }, {
  519         'url': 'https://www.ludo.fr/heros/ninjago',
  520         'info_dict': {
  521             'id': 'ninjago',
  522         },
  523         'playlist_count': 10,
  524     }, {
  525         'url': 'https://www.zouzous.fr/heros/simon?abc',
  526         'only_matching': True,
  527     }]
  528 
  529     def _real_extract(self, url):
  530         mobj = re.match(self._VALID_URL, url)
  531         playlist_id = mobj.group('id')
  532 
  533         playlist = self._download_json(
  534             '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id)
  535 
  536         if not playlist.get('count'):
  537             raise ExtractorError(
  538                 '%s is not available' % playlist_id, expected=True)
  539 
  540         entries = []
  541         for item in playlist['items']:
  542             identity = item.get('identity')
  543             if identity and isinstance(identity, compat_str):
  544                 entries.append(self._make_url_result(identity))
  545 
  546         return self.playlist_result(entries, playlist_id)