youtube_dl/extractor/tvplay.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import re
    5 
    6 from .common import InfoExtractor
    7 from ..compat import (
    8     compat_HTTPError,
    9     compat_str,
   10     compat_urlparse,
   11 )
   12 from ..utils import (
   13     determine_ext,
   14     ExtractorError,
   15     int_or_none,
   16     parse_iso8601,
   17     qualities,
   18     try_get,
   19     update_url_query,
   20 )
   21 
   22 
   23 class TVPlayIE(InfoExtractor):
   24     IE_NAME = 'mtg'
   25     IE_DESC = 'MTG services'
   26     _VALID_URL = r'''(?x)
   27                     (?:
   28                         mtg:|
   29                         https?://
   30                             (?:www\.)?
   31                             (?:
   32                                 tvplay(?:\.skaties)?\.lv/parraides|
   33                                 (?:tv3play|play\.tv3)\.lt/programos|
   34                                 tv3play(?:\.tv3)?\.ee/sisu|
   35                                 (?:tv(?:3|6|8|10)play|viafree)\.se/program|
   36                                 (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer|
   37                                 play\.novatv\.bg/programi
   38                             )
   39                             /(?:[^/]+/)+
   40                         )
   41                         (?P<id>\d+)
   42                     '''
   43     _TESTS = [
   44         {
   45             'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true',
   46             'md5': 'a1612fe0849455423ad8718fe049be21',
   47             'info_dict': {
   48                 'id': '418113',
   49                 'ext': 'mp4',
   50                 'title': 'Kādi ir īri? - Viņas melo labāk',
   51                 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.',
   52                 'series': 'Viņas melo labāk',
   53                 'season': '2.sezona',
   54                 'season_number': 2,
   55                 'duration': 25,
   56                 'timestamp': 1406097056,
   57                 'upload_date': '20140723',
   58             },
   59         },
   60         {
   61             'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true',
   62             'info_dict': {
   63                 'id': '409229',
   64                 'ext': 'flv',
   65                 'title': 'Moterys meluoja geriau',
   66                 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e',
   67                 'series': 'Moterys meluoja geriau',
   68                 'episode_number': 47,
   69                 'season': '1 sezonas',
   70                 'season_number': 1,
   71                 'duration': 1330,
   72                 'timestamp': 1403769181,
   73                 'upload_date': '20140626',
   74             },
   75             'params': {
   76                 # rtmp download
   77                 'skip_download': True,
   78             },
   79         },
   80         {
   81             'url': 'http://www.tv3play.ee/sisu/kodu-keset-linna/238551?autostart=true',
   82             'info_dict': {
   83                 'id': '238551',
   84                 'ext': 'flv',
   85                 'title': 'Kodu keset linna 398537',
   86                 'description': 'md5:7df175e3c94db9e47c0d81ffa5d68701',
   87                 'duration': 1257,
   88                 'timestamp': 1292449761,
   89                 'upload_date': '20101215',
   90             },
   91             'params': {
   92                 # rtmp download
   93                 'skip_download': True,
   94             },
   95         },
   96         {
   97             'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true',
   98             'info_dict': {
   99                 'id': '395385',
  100                 'ext': 'mp4',
  101                 'title': 'Husräddarna S02E07',
  102                 'description': 'md5:f210c6c89f42d4fc39faa551be813777',
  103                 'duration': 2574,
  104                 'timestamp': 1400596321,
  105                 'upload_date': '20140520',
  106             },
  107             'params': {
  108                 'skip_download': True,
  109             },
  110         },
  111         {
  112             'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true',
  113             'info_dict': {
  114                 'id': '266636',
  115                 'ext': 'mp4',
  116                 'title': 'Den sista dokusåpan S01E08',
  117                 'description': 'md5:295be39c872520221b933830f660b110',
  118                 'duration': 1492,
  119                 'timestamp': 1330522854,
  120                 'upload_date': '20120229',
  121                 'age_limit': 18,
  122             },
  123             'params': {
  124                 'skip_download': True,
  125             },
  126         },
  127         {
  128             'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true',
  129             'info_dict': {
  130                 'id': '282756',
  131                 'ext': 'mp4',
  132                 'title': 'Antikjakten S01E10',
  133                 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8',
  134                 'duration': 2646,
  135                 'timestamp': 1348575868,
  136                 'upload_date': '20120925',
  137             },
  138             'params': {
  139                 'skip_download': True,
  140             },
  141         },
  142         {
  143             'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true',
  144             'info_dict': {
  145                 'id': '230898',
  146                 'ext': 'mp4',
  147                 'title': 'Anna Anka søker assistent - Ep. 8',
  148                 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474',
  149                 'duration': 2656,
  150                 'timestamp': 1277720005,
  151                 'upload_date': '20100628',
  152             },
  153             'params': {
  154                 'skip_download': True,
  155             },
  156         },
  157         {
  158             'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true',
  159             'info_dict': {
  160                 'id': '21873',
  161                 'ext': 'mp4',
  162                 'title': 'Budbringerne program 10',
  163                 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d',
  164                 'duration': 1297,
  165                 'timestamp': 1254205102,
  166                 'upload_date': '20090929',
  167             },
  168             'params': {
  169                 'skip_download': True,
  170             },
  171         },
  172         {
  173             'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true',
  174             'info_dict': {
  175                 'id': '361883',
  176                 'ext': 'mp4',
  177                 'title': 'Hotelinspektør Alex Polizzi - Ep. 10',
  178                 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81',
  179                 'duration': 2594,
  180                 'timestamp': 1393236292,
  181                 'upload_date': '20140224',
  182             },
  183             'params': {
  184                 'skip_download': True,
  185             },
  186         },
  187         {
  188             'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
  189             'info_dict': {
  190                 'id': '624952',
  191                 'ext': 'flv',
  192                 'title': 'Здравей, България (12.06.2015 г.) ',
  193                 'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
  194                 'duration': 8838,
  195                 'timestamp': 1434100372,
  196                 'upload_date': '20150612',
  197             },
  198             'params': {
  199                 # rtmp download
  200                 'skip_download': True,
  201             },
  202         },
  203         {
  204             'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true',
  205             'only_matching': True,
  206         },
  207         {
  208             # views is null
  209             'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183',
  210             'only_matching': True,
  211         },
  212         {
  213             'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true',
  214             'only_matching': True,
  215         },
  216         {
  217             'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
  218             'only_matching': True,
  219         },
  220         {
  221             'url': 'mtg:418113',
  222             'only_matching': True,
  223         }
  224     ]
  225 
  226     def _real_extract(self, url):
  227         video_id = self._match_id(url)
  228 
  229         video = self._download_json(
  230             'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
  231 
  232         title = video['title']
  233 
  234         try:
  235             streams = self._download_json(
  236                 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id,
  237                 video_id, 'Downloading streams JSON')
  238         except ExtractorError as e:
  239             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
  240                 msg = self._parse_json(e.cause.read().decode('utf-8'), video_id)
  241                 raise ExtractorError(msg['msg'], expected=True)
  242             raise
  243 
  244         quality = qualities(['hls', 'medium', 'high'])
  245         formats = []
  246         for format_id, video_url in streams.get('streams', {}).items():
  247             if not video_url or not isinstance(video_url, compat_str):
  248                 continue
  249             ext = determine_ext(video_url)
  250             if ext == 'f4m':
  251                 formats.extend(self._extract_f4m_formats(
  252                     update_url_query(video_url, {
  253                         'hdcore': '3.5.0',
  254                         'plugin': 'aasp-3.5.0.151.81'
  255                     }), video_id, f4m_id='hds', fatal=False))
  256             elif ext == 'm3u8':
  257                 formats.extend(self._extract_m3u8_formats(
  258                     video_url, video_id, 'mp4', 'm3u8_native',
  259                     m3u8_id='hls', fatal=False))
  260             else:
  261                 fmt = {
  262                     'format_id': format_id,
  263                     'quality': quality(format_id),
  264                     'ext': ext,
  265                 }
  266                 if video_url.startswith('rtmp'):
  267                     m = re.search(
  268                         r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url)
  269                     if not m:
  270                         continue
  271                     fmt.update({
  272                         'ext': 'flv',
  273                         'url': m.group('url'),
  274                         'app': m.group('app'),
  275                         'play_path': m.group('playpath'),
  276                     })
  277                 else:
  278                     fmt.update({
  279                         'url': video_url,
  280                     })
  281                 formats.append(fmt)
  282 
  283         if not formats and video.get('is_geo_blocked'):
  284             self.raise_geo_restricted(
  285                 'This content might not be available in your country due to copyright reasons')
  286 
  287         self._sort_formats(formats)
  288 
  289         # TODO: webvtt in m3u8
  290         subtitles = {}
  291         sami_path = video.get('sami_path')
  292         if sami_path:
  293             lang = self._search_regex(
  294                 r'_([a-z]{2})\.xml', sami_path, 'lang',
  295                 default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1])
  296             subtitles[lang] = [{
  297                 'url': sami_path,
  298             }]
  299 
  300         series = video.get('format_title')
  301         episode_number = int_or_none(video.get('format_position', {}).get('episode'))
  302         season = video.get('_embedded', {}).get('season', {}).get('title')
  303         season_number = int_or_none(video.get('format_position', {}).get('season'))
  304 
  305         return {
  306             'id': video_id,
  307             'title': title,
  308             'description': video.get('description'),
  309             'series': series,
  310             'episode_number': episode_number,
  311             'season': season,
  312             'season_number': season_number,
  313             'duration': int_or_none(video.get('duration')),
  314             'timestamp': parse_iso8601(video.get('created_at')),
  315             'view_count': try_get(video, lambda x: x['views']['total'], int),
  316             'age_limit': int_or_none(video.get('age_limit', 0)),
  317             'formats': formats,
  318             'subtitles': subtitles,
  319         }
  320 
  321 
  322 class ViafreeIE(InfoExtractor):
  323     _VALID_URL = r'''(?x)
  324                     https?://
  325                         (?:www\.)?
  326                         viafree\.
  327                         (?:
  328                             (?:dk|no)/programmer|
  329                             se/program
  330                         )
  331                         /(?:[^/]+/)+(?P<id>[^/?#&]+)
  332                     '''
  333     _TESTS = [{
  334         'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2',
  335         'info_dict': {
  336             'id': '395375',
  337             'ext': 'mp4',
  338             'title': 'Husräddarna S02E02',
  339             'description': 'md5:4db5c933e37db629b5a2f75dfb34829e',
  340             'series': 'Husräddarna',
  341             'season': 'Säsong 2',
  342             'season_number': 2,
  343             'duration': 2576,
  344             'timestamp': 1400596321,
  345             'upload_date': '20140520',
  346         },
  347         'params': {
  348             'skip_download': True,
  349         },
  350         'add_ie': [TVPlayIE.ie_key()],
  351     }, {
  352         # with relatedClips
  353         'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1',
  354         'info_dict': {
  355             'id': '758770',
  356             'ext': 'mp4',
  357             'title': 'Sommaren med YouTube-stjärnorna S01E01',
  358             'description': 'md5:2bc69dce2c4bb48391e858539bbb0e3f',
  359             'series': 'Sommaren med YouTube-stjärnorna',
  360             'season': 'Säsong 1',
  361             'season_number': 1,
  362             'duration': 1326,
  363             'timestamp': 1470905572,
  364             'upload_date': '20160811',
  365         },
  366         'params': {
  367             'skip_download': True,
  368         },
  369         'add_ie': [TVPlayIE.ie_key()],
  370     }, {
  371         # Different og:image URL schema
  372         'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2',
  373         'only_matching': True,
  374     }, {
  375         'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
  376         'only_matching': True,
  377     }, {
  378         'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5',
  379         'only_matching': True,
  380     }]
  381 
  382     @classmethod
  383     def suitable(cls, url):
  384         return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url)
  385 
  386     def _real_extract(self, url):
  387         video_id = self._match_id(url)
  388 
  389         webpage = self._download_webpage(url, video_id)
  390 
  391         data = self._parse_json(
  392             self._search_regex(
  393                 r'(?s)window\.App\s*=\s*({.+?})\s*;\s*</script',
  394                 webpage, 'data', default='{}'),
  395             video_id, transform_source=lambda x: re.sub(
  396                 r'(?s)function\s+[a-zA-Z_][\da-zA-Z_]*\s*\([^)]*\)\s*{[^}]*}\s*',
  397                 'null', x), fatal=False)
  398 
  399         video_id = None
  400 
  401         if data:
  402             video_id = try_get(
  403                 data, lambda x: x['context']['dispatcher']['stores'][
  404                     'ContentPageProgramStore']['currentVideo']['id'],
  405                 compat_str)
  406 
  407         # Fallback #1 (extract from og:image URL schema)
  408         if not video_id:
  409             thumbnail = self._og_search_thumbnail(webpage, default=None)
  410             if thumbnail:
  411                 video_id = self._search_regex(
  412                     # Patterns seen:
  413                     #  http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/inbox/765166/a2e95e5f1d735bab9f309fa345cc3f25.jpg
  414                     #  http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/seasons/15204/758770/4a5ba509ca8bc043e1ebd1a76131cdf2.jpg
  415                     r'https?://[^/]+/imagecache/(?:[^/]+/)+(\d{6,})/',
  416                     thumbnail, 'video id', default=None)
  417 
  418         # Fallback #2. Extract from raw JSON string.
  419         # May extract wrong video id if relatedClips is present.
  420         if not video_id:
  421             video_id = self._search_regex(
  422                 r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})',
  423                 webpage, 'video id')
  424 
  425         return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key())