youtube_dl/extractor/rtve.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import base64
    5 import io
    6 import re
    7 import sys
    8 
    9 from .common import InfoExtractor
   10 from ..compat import (
   11     compat_b64decode,
   12     compat_struct_unpack,
   13 )
   14 from ..utils import (
   15     determine_ext,
   16     ExtractorError,
   17     float_or_none,
   18     qualities,
   19     remove_end,
   20     remove_start,
   21     std_headers,
   22 )
   23 
   24 _bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x))
   25 
   26 
   27 class RTVEALaCartaIE(InfoExtractor):
   28     IE_NAME = 'rtve.es:alacarta'
   29     IE_DESC = 'RTVE a la carta'
   30     _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?((alacarta|play)/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'
   31 
   32     _TESTS = [{
   33         'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
   34         'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
   35         'info_dict': {
   36             'id': '2491869',
   37             'ext': 'mp4',
   38             'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
   39             'duration': 5024.566,
   40             'series': 'Balonmano',
   41         },
   42         'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
   43     }, {
   44         'url': 'http://www.rtve.es/play/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
   45         'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
   46         'info_dict': {
   47             'id': '2491869',
   48             'ext': 'mp4',
   49             'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
   50             'duration': 5024.566,
   51             'series': 'Balonmano',
   52         },
   53         'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
   54     }, {
   55         'note': 'Live stream',
   56         'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/',
   57         'info_dict': {
   58             'id': '1694255',
   59             'ext': 'mp4',
   60             'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
   61             'is_live': True,
   62         },
   63         'params': {
   64             'skip_download': 'live stream',
   65         },
   66     }, {
   67         'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/',
   68         'md5': 'd850f3c8731ea53952ebab489cf81cbf',
   69         'info_dict': {
   70             'id': '4236788',
   71             'ext': 'mp4',
   72             'title': 'Servir y proteger - Capítulo 104',
   73             'duration': 3222.0,
   74         },
   75         'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
   76     }, {
   77         'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
   78         'only_matching': True,
   79     }, {
   80         'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/',
   81         'only_matching': True,
   82     }]
   83 
   84     def _real_initialize(self):
   85         user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
   86         self._manager = self._download_json(
   87             'http://www.rtve.es/odin/loki/' + user_agent_b64,
   88             None, 'Fetching manager info')['manager']
   89 
   90     @staticmethod
   91     def _decrypt_url(png):
   92         encrypted_data = io.BytesIO(compat_b64decode(png)[8:])
   93         while True:
   94             length = compat_struct_unpack('!I', encrypted_data.read(4))[0]
   95             chunk_type = encrypted_data.read(4)
   96             if chunk_type == b'IEND':
   97                 break
   98             data = encrypted_data.read(length)
   99             if chunk_type == b'tEXt':
  100                 alphabet_data, text = data.split(b'\0')
  101                 quality, url_data = text.split(b'%%')
  102                 alphabet = []
  103                 e = 0
  104                 d = 0
  105                 for l in _bytes_to_chr(alphabet_data):
  106                     if d == 0:
  107                         alphabet.append(l)
  108                         d = e = (e + 1) % 4
  109                     else:
  110                         d -= 1
  111                 url = ''
  112                 f = 0
  113                 e = 3
  114                 b = 1
  115                 for letter in _bytes_to_chr(url_data):
  116                     if f == 0:
  117                         l = int(letter) * 10
  118                         f = 1
  119                     else:
  120                         if e == 0:
  121                             l += int(letter)
  122                             url += alphabet[l]
  123                             e = (b + 3) % 4
  124                             f = 0
  125                             b += 1
  126                         else:
  127                             e -= 1
  128 
  129                 yield quality.decode(), url
  130             encrypted_data.read(4)  # CRC
  131 
  132     def _extract_png_formats(self, video_id):
  133         png = self._download_webpage(
  134             'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id),
  135             video_id, 'Downloading url information', query={'q': 'v2'})
  136         q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
  137         formats = []
  138         for quality, video_url in self._decrypt_url(png):
  139             ext = determine_ext(video_url)
  140             if ext == 'm3u8':
  141                 formats.extend(self._extract_m3u8_formats(
  142                     video_url, video_id, 'mp4', 'm3u8_native',
  143                     m3u8_id='hls', fatal=False))
  144             elif ext == 'mpd':
  145                 formats.extend(self._extract_mpd_formats(
  146                     video_url, video_id, 'dash', fatal=False))
  147             else:
  148                 formats.append({
  149                     'format_id': quality,
  150                     'quality': q(quality),
  151                     'url': video_url,
  152                 })
  153         self._sort_formats(formats)
  154         return formats
  155 
  156     def _real_extract(self, url):
  157         video_id = self._match_id(url)
  158         info = self._download_json(
  159             'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
  160             video_id)['page']['items'][0]
  161         if info['state'] == 'DESPU':
  162             raise ExtractorError('The video is no longer available', expected=True)
  163         title = info['title'].strip()
  164         formats = self._extract_png_formats(video_id)
  165 
  166         subtitles = None
  167         sbt_file = info.get('sbtFile')
  168         if sbt_file:
  169             subtitles = self.extract_subtitles(video_id, sbt_file)
  170 
  171         is_live = info.get('live') is True
  172 
  173         return {
  174             'id': video_id,
  175             'title': self._live_title(title) if is_live else title,
  176             'formats': formats,
  177             'thumbnail': info.get('image'),
  178             'subtitles': subtitles,
  179             'duration': float_or_none(info.get('duration'), 1000),
  180             'is_live': is_live,
  181             'series': info.get('programTitle'),
  182         }
  183 
  184     def _get_subtitles(self, video_id, sub_file):
  185         subs = self._download_json(
  186             sub_file + '.json', video_id,
  187             'Downloading subtitles info')['page']['items']
  188         return dict(
  189             (s['lang'], [{'ext': 'vtt', 'url': s['src']}])
  190             for s in subs)
  191 
  192 
  193 class RTVEInfantilIE(RTVEALaCartaIE):
  194     IE_NAME = 'rtve.es:infantil'
  195     IE_DESC = 'RTVE infantil'
  196     _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/'
  197 
  198     _TESTS = [{
  199         'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/',
  200         'md5': '5747454717aedf9f9fdf212d1bcfc48d',
  201         'info_dict': {
  202             'id': '3040283',
  203             'ext': 'mp4',
  204             'title': 'Maneras de vivir',
  205             'thumbnail': r're:https?://.+/1426182947956\.JPG',
  206             'duration': 357.958,
  207         },
  208         'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  209     }]
  210 
  211 
  212 class RTVELiveIE(RTVEALaCartaIE):
  213     IE_NAME = 'rtve.es:live'
  214     IE_DESC = 'RTVE.es live streams'
  215     _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
  216 
  217     _TESTS = [{
  218         'url': 'http://www.rtve.es/directo/la-1/',
  219         'info_dict': {
  220             'id': 'la-1',
  221             'ext': 'mp4',
  222             'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  223         },
  224         'params': {
  225             'skip_download': 'live stream',
  226         }
  227     }]
  228 
  229     def _real_extract(self, url):
  230         mobj = re.match(self._VALID_URL, url)
  231         video_id = mobj.group('id')
  232 
  233         webpage = self._download_webpage(url, video_id)
  234         title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
  235         title = remove_start(title, 'Estoy viendo ')
  236 
  237         vidplayer_id = self._search_regex(
  238             (r'playerId=player([0-9]+)',
  239              r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',
  240              r'data-id=["\'](\d+)'),
  241             webpage, 'internal video ID')
  242 
  243         return {
  244             'id': video_id,
  245             'title': self._live_title(title),
  246             'formats': self._extract_png_formats(vidplayer_id),
  247             'is_live': True,
  248         }
  249 
  250 
  251 class RTVETelevisionIE(InfoExtractor):
  252     IE_NAME = 'rtve.es:television'
  253     _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml'
  254 
  255     _TEST = {
  256         'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml',
  257         'info_dict': {
  258             'id': '3069778',
  259             'ext': 'mp4',
  260             'title': 'Documentos TV - La revolución del móvil',
  261             'duration': 3496.948,
  262         },
  263         'params': {
  264             'skip_download': True,
  265         },
  266     }
  267 
  268     def _real_extract(self, url):
  269         page_id = self._match_id(url)
  270         webpage = self._download_webpage(url, page_id)
  271 
  272         alacarta_url = self._search_regex(
  273             r'data-location="alacarta_videos"[^<]+url&quot;:&quot;(http://www\.rtve\.es/alacarta.+?)&',
  274             webpage, 'alacarta url', default=None)
  275         if alacarta_url is None:
  276             raise ExtractorError(
  277                 'The webpage doesn\'t contain any video', expected=True)
  278 
  279         return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key())