youtube_dl/extractor/spreaker.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import itertools
    5 
    6 from .common import InfoExtractor
    7 from ..compat import compat_str
    8 from ..utils import (
    9     float_or_none,
   10     int_or_none,
   11     str_or_none,
   12     try_get,
   13     unified_timestamp,
   14     url_or_none,
   15 )
   16 
   17 
   18 def _extract_episode(data, episode_id=None):
   19     title = data['title']
   20     download_url = data['download_url']
   21 
   22     series = try_get(data, lambda x: x['show']['title'], compat_str)
   23     uploader = try_get(data, lambda x: x['author']['fullname'], compat_str)
   24 
   25     thumbnails = []
   26     for image in ('image_original', 'image_medium', 'image'):
   27         image_url = url_or_none(data.get('%s_url' % image))
   28         if image_url:
   29             thumbnails.append({'url': image_url})
   30 
   31     def stats(key):
   32         return int_or_none(try_get(
   33             data,
   34             (lambda x: x['%ss_count' % key],
   35              lambda x: x['stats']['%ss' % key])))
   36 
   37     def duration(key):
   38         return float_or_none(data.get(key), scale=1000)
   39 
   40     return {
   41         'id': compat_str(episode_id or data['episode_id']),
   42         'url': download_url,
   43         'display_id': data.get('permalink'),
   44         'title': title,
   45         'description': data.get('description'),
   46         'timestamp': unified_timestamp(data.get('published_at')),
   47         'uploader': uploader,
   48         'uploader_id': str_or_none(data.get('author_id')),
   49         'creator': uploader,
   50         'duration': duration('duration') or duration('length'),
   51         'view_count': stats('play'),
   52         'like_count': stats('like'),
   53         'comment_count': stats('message'),
   54         'format': 'MPEG Layer 3',
   55         'format_id': 'mp3',
   56         'container': 'mp3',
   57         'ext': 'mp3',
   58         'thumbnails': thumbnails,
   59         'series': series,
   60         'extractor_key': SpreakerIE.ie_key(),
   61     }
   62 
   63 
   64 class SpreakerIE(InfoExtractor):
   65     _VALID_URL = r'''(?x)
   66                     https?://
   67                         api\.spreaker\.com/
   68                         (?:
   69                             (?:download/)?episode|
   70                             v2/episodes
   71                         )/
   72                         (?P<id>\d+)
   73                     '''
   74     _TESTS = [{
   75         'url': 'https://api.spreaker.com/episode/12534508',
   76         'info_dict': {
   77             'id': '12534508',
   78             'display_id': 'swm-ep15-how-to-market-your-music-part-2',
   79             'ext': 'mp3',
   80             'title': 'EP:15 | Music Marketing (Likes) - Part 2',
   81             'description': 'md5:0588c43e27be46423e183076fa071177',
   82             'timestamp': 1502250336,
   83             'upload_date': '20170809',
   84             'uploader': 'SWM',
   85             'uploader_id': '9780658',
   86             'duration': 1063.42,
   87             'view_count': int,
   88             'like_count': int,
   89             'comment_count': int,
   90             'series': 'Success With Music (SWM)',
   91         },
   92     }, {
   93         'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3',
   94         'only_matching': True,
   95     }, {
   96         'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments',
   97         'only_matching': True,
   98     }]
   99 
  100     def _real_extract(self, url):
  101         episode_id = self._match_id(url)
  102         data = self._download_json(
  103             'https://api.spreaker.com/v2/episodes/%s' % episode_id,
  104             episode_id)['response']['episode']
  105         return _extract_episode(data, episode_id)
  106 
  107 
  108 class SpreakerPageIE(InfoExtractor):
  109     _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)'
  110     _TESTS = [{
  111         'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2',
  112         'only_matching': True,
  113     }]
  114 
  115     def _real_extract(self, url):
  116         display_id = self._match_id(url)
  117         webpage = self._download_webpage(url, display_id)
  118         episode_id = self._search_regex(
  119             (r'data-episode_id=["\'](?P<id>\d+)',
  120              r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id')
  121         return self.url_result(
  122             'https://api.spreaker.com/episode/%s' % episode_id,
  123             ie=SpreakerIE.ie_key(), video_id=episode_id)
  124 
  125 
  126 class SpreakerShowIE(InfoExtractor):
  127     _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)'
  128     _TESTS = [{
  129         'url': 'https://api.spreaker.com/show/4652058',
  130         'info_dict': {
  131             'id': '4652058',
  132         },
  133         'playlist_mincount': 118,
  134     }]
  135 
  136     def _entries(self, show_id):
  137         for page_num in itertools.count(1):
  138             episodes = self._download_json(
  139                 'https://api.spreaker.com/show/%s/episodes' % show_id,
  140                 show_id, note='Downloading JSON page %d' % page_num, query={
  141                     'page': page_num,
  142                     'max_per_page': 100,
  143                 })
  144             pager = try_get(episodes, lambda x: x['response']['pager'], dict)
  145             if not pager:
  146                 break
  147             results = pager.get('results')
  148             if not results or not isinstance(results, list):
  149                 break
  150             for result in results:
  151                 if not isinstance(result, dict):
  152                     continue
  153                 yield _extract_episode(result)
  154             if page_num == pager.get('last_page'):
  155                 break
  156 
  157     def _real_extract(self, url):
  158         show_id = self._match_id(url)
  159         return self.playlist_result(self._entries(show_id), playlist_id=show_id)
  160 
  161 
  162 class SpreakerShowPageIE(InfoExtractor):
  163     _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)'
  164     _TESTS = [{
  165         'url': 'https://www.spreaker.com/show/success-with-music',
  166         'only_matching': True,
  167     }]
  168 
  169     def _real_extract(self, url):
  170         display_id = self._match_id(url)
  171         webpage = self._download_webpage(url, display_id)
  172         show_id = self._search_regex(
  173             r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id')
  174         return self.url_result(
  175             'https://api.spreaker.com/show/%s' % show_id,
  176             ie=SpreakerShowIE.ie_key(), video_id=show_id)