youtube_dl/extractor/mixcloud.py



    1 from __future__ import unicode_literals
    2 
    3 import base64
    4 import functools
    5 import itertools
    6 import re
    7 
    8 from .common import InfoExtractor
    9 from ..compat import (
   10     compat_chr,
   11     compat_ord,
   12     compat_urllib_parse_unquote,
   13     compat_urlparse,
   14 )
   15 from ..utils import (
   16     clean_html,
   17     ExtractorError,
   18     OnDemandPagedList,
   19     parse_count,
   20     str_to_int,
   21 )
   22 
   23 
   24 class MixcloudIE(InfoExtractor):
   25     _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
   26     IE_NAME = 'mixcloud'
   27 
   28     _TESTS = [{
   29         'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
   30         'info_dict': {
   31             'id': 'dholbach-cryptkeeper',
   32             'ext': 'm4a',
   33             'title': 'Cryptkeeper',
   34             'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
   35             'uploader': 'Daniel Holbach',
   36             'uploader_id': 'dholbach',
   37             'thumbnail': r're:https?://.*\.jpg',
   38             'view_count': int,
   39             'like_count': int,
   40         },
   41     }, {
   42         'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
   43         'info_dict': {
   44             'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
   45             'ext': 'mp3',
   46             'title': 'Caribou 7 inch Vinyl Mix & Chat',
   47             'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
   48             'uploader': 'Gilles Peterson Worldwide',
   49             'uploader_id': 'gillespeterson',
   50             'thumbnail': 're:https?://.*',
   51             'view_count': int,
   52             'like_count': int,
   53         },
   54     }, {
   55         'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
   56         'only_matching': True,
   57     }]
   58 
   59     # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
   60     @staticmethod
   61     def _decrypt_play_info(play_info):
   62         KEY = 'pleasedontdownloadourmusictheartistswontgetpaid'
   63 
   64         play_info = base64.b64decode(play_info.encode('ascii'))
   65 
   66         return ''.join([
   67             compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)]))
   68             for idx, ch in enumerate(play_info)])
   69 
   70     def _real_extract(self, url):
   71         mobj = re.match(self._VALID_URL, url)
   72         uploader = mobj.group(1)
   73         cloudcast_name = mobj.group(2)
   74         track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))
   75 
   76         webpage = self._download_webpage(url, track_id)
   77 
   78         message = self._html_search_regex(
   79             r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
   80             webpage, 'error message', default=None)
   81 
   82         encrypted_play_info = self._search_regex(
   83             r'm-play-info="([^"]+)"', webpage, 'play info')
   84         play_info = self._parse_json(
   85             self._decrypt_play_info(encrypted_play_info), track_id)
   86 
   87         if message and 'stream_url' not in play_info:
   88             raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
   89 
   90         song_url = play_info['stream_url']
   91 
   92         PREFIX = (
   93             r'm-play-on-spacebar[^>]+'
   94             r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
   95         title = self._html_search_regex(
   96             PREFIX + r'm-title="([^"]+)"', webpage, 'title')
   97         thumbnail = self._proto_relative_url(self._html_search_regex(
   98             PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail',
   99             fatal=False))
  100         uploader = self._html_search_regex(
  101             PREFIX + r'm-owner-name="([^"]+)"',
  102             webpage, 'uploader', fatal=False)
  103         uploader_id = self._search_regex(
  104             r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
  105         description = self._og_search_description(webpage)
  106         like_count = parse_count(self._search_regex(
  107             r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
  108             webpage, 'like count', default=None))
  109         view_count = str_to_int(self._search_regex(
  110             [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
  111              r'/listeners/?">([0-9,.]+)</a>'],
  112             webpage, 'play count', default=None))
  113 
  114         return {
  115             'id': track_id,
  116             'title': title,
  117             'url': song_url,
  118             'description': description,
  119             'thumbnail': thumbnail,
  120             'uploader': uploader,
  121             'uploader_id': uploader_id,
  122             'view_count': view_count,
  123             'like_count': like_count,
  124         }
  125 
  126 
  127 class MixcloudPlaylistBaseIE(InfoExtractor):
  128     _PAGE_SIZE = 24
  129 
  130     def _find_urls_in_page(self, page):
  131         for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page):
  132             yield self.url_result(
  133                 compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)),
  134                 MixcloudIE.ie_key())
  135 
  136     def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None):
  137         real_page_number = real_page_number or current_page + 1
  138         return self._download_webpage(
  139             'https://www.mixcloud.com/%s/' % path, video_id,
  140             note='Download %s (page %d)' % (page_name, current_page + 1),
  141             errnote='Unable to download %s' % page_name,
  142             query={'page': real_page_number, 'list': 'main', '_ajax': '1'},
  143             headers={'X-Requested-With': 'XMLHttpRequest'})
  144 
  145     def _tracks_page_func(self, page, video_id, page_name, current_page):
  146         resp = self._fetch_tracks_page(page, video_id, page_name, current_page)
  147 
  148         for item in self._find_urls_in_page(resp):
  149             yield item
  150 
  151     def _get_user_description(self, page_content):
  152         return self._html_search_regex(
  153             r'<div[^>]+class="description-text"[^>]*>(.+?)</div>',
  154             page_content, 'user description', fatal=False)
  155 
  156 
  157 class MixcloudUserIE(MixcloudPlaylistBaseIE):
  158     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
  159     IE_NAME = 'mixcloud:user'
  160 
  161     _TESTS = [{
  162         'url': 'http://www.mixcloud.com/dholbach/',
  163         'info_dict': {
  164             'id': 'dholbach_uploads',
  165             'title': 'Daniel Holbach (uploads)',
  166             'description': 'md5:327af72d1efeb404a8216c27240d1370',
  167         },
  168         'playlist_mincount': 11,
  169     }, {
  170         'url': 'http://www.mixcloud.com/dholbach/uploads/',
  171         'info_dict': {
  172             'id': 'dholbach_uploads',
  173             'title': 'Daniel Holbach (uploads)',
  174             'description': 'md5:327af72d1efeb404a8216c27240d1370',
  175         },
  176         'playlist_mincount': 11,
  177     }, {
  178         'url': 'http://www.mixcloud.com/dholbach/favorites/',
  179         'info_dict': {
  180             'id': 'dholbach_favorites',
  181             'title': 'Daniel Holbach (favorites)',
  182             'description': 'md5:327af72d1efeb404a8216c27240d1370',
  183         },
  184         'params': {
  185             'playlist_items': '1-100',
  186         },
  187         'playlist_mincount': 100,
  188     }, {
  189         'url': 'http://www.mixcloud.com/dholbach/listens/',
  190         'info_dict': {
  191             'id': 'dholbach_listens',
  192             'title': 'Daniel Holbach (listens)',
  193             'description': 'md5:327af72d1efeb404a8216c27240d1370',
  194         },
  195         'params': {
  196             'playlist_items': '1-100',
  197         },
  198         'playlist_mincount': 100,
  199     }]
  200 
  201     def _real_extract(self, url):
  202         mobj = re.match(self._VALID_URL, url)
  203         user_id = mobj.group('user')
  204         list_type = mobj.group('type')
  205 
  206         # if only a profile URL was supplied, default to download all uploads
  207         if list_type is None:
  208             list_type = 'uploads'
  209 
  210         video_id = '%s_%s' % (user_id, list_type)
  211 
  212         profile = self._download_webpage(
  213             'https://www.mixcloud.com/%s/' % user_id, video_id,
  214             note='Downloading user profile',
  215             errnote='Unable to download user profile')
  216 
  217         username = self._og_search_title(profile)
  218         description = self._get_user_description(profile)
  219 
  220         entries = OnDemandPagedList(
  221             functools.partial(
  222                 self._tracks_page_func,
  223                 '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type),
  224             self._PAGE_SIZE, use_cache=True)
  225 
  226         return self.playlist_result(
  227             entries, video_id, '%s (%s)' % (username, list_type), description)
  228 
  229 
  230 class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
  231     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
  232     IE_NAME = 'mixcloud:playlist'
  233 
  234     _TESTS = [{
  235         'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/',
  236         'info_dict': {
  237             'id': 'RedBullThre3style_tokyo-finalists-2015',
  238             'title': 'National Champions 2015',
  239             'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3',
  240         },
  241         'playlist_mincount': 16,
  242     }, {
  243         'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
  244         'info_dict': {
  245             'id': 'maxvibes_jazzcat-on-ness-radio',
  246             'title': 'Jazzcat on Ness Radio',
  247             'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263',
  248         },
  249         'playlist_mincount': 23
  250     }]
  251 
  252     def _real_extract(self, url):
  253         mobj = re.match(self._VALID_URL, url)
  254         user_id = mobj.group('user')
  255         playlist_id = mobj.group('playlist')
  256         video_id = '%s_%s' % (user_id, playlist_id)
  257 
  258         profile = self._download_webpage(
  259             url, user_id,
  260             note='Downloading playlist page',
  261             errnote='Unable to download playlist page')
  262 
  263         description = self._get_user_description(profile)
  264         playlist_title = self._html_search_regex(
  265             r'<span[^>]+class="[^"]*list-playlist-title[^"]*"[^>]*>(.*?)</span>',
  266             profile, 'playlist title')
  267 
  268         entries = OnDemandPagedList(
  269             functools.partial(
  270                 self._tracks_page_func,
  271                 '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
  272             self._PAGE_SIZE)
  273 
  274         return self.playlist_result(entries, video_id, playlist_title, description)
  275 
  276 
  277 class MixcloudStreamIE(MixcloudPlaylistBaseIE):
  278     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
  279     IE_NAME = 'mixcloud:stream'
  280 
  281     _TEST = {
  282         'url': 'https://www.mixcloud.com/FirstEar/stream/',
  283         'info_dict': {
  284             'id': 'FirstEar',
  285             'title': 'First Ear',
  286             'description': 'Curators of good music\nfirstearmusic.com',
  287         },
  288         'playlist_mincount': 192,
  289     }
  290 
  291     def _real_extract(self, url):
  292         user_id = self._match_id(url)
  293 
  294         webpage = self._download_webpage(url, user_id)
  295 
  296         entries = []
  297         prev_page_url = None
  298 
  299         def _handle_page(page):
  300             entries.extend(self._find_urls_in_page(page))
  301             return self._search_regex(
  302                 r'm-next-page-url="([^"]+)"', page,
  303                 'next page URL', default=None)
  304 
  305         next_page_url = _handle_page(webpage)
  306 
  307         for idx in itertools.count(0):
  308             if not next_page_url or prev_page_url == next_page_url:
  309                 break
  310 
  311             prev_page_url = next_page_url
  312             current_page = int(self._search_regex(
  313                 r'\?page=(\d+)', next_page_url, 'next page number'))
  314 
  315             next_page_url = _handle_page(self._fetch_tracks_page(
  316                 '%s/stream' % user_id, user_id, 'stream', idx,
  317                 real_page_number=current_page))
  318 
  319         username = self._og_search_title(webpage)
  320         description = self._get_user_description(webpage)
  321 
  322         return self.playlist_result(entries, user_id, username, description)