youtube_dl/extractor/bilibili.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import hashlib
    5 import re
    6 
    7 from .common import InfoExtractor
    8 from ..compat import compat_parse_qs
    9 from ..utils import (
   10     int_or_none,
   11     float_or_none,
   12     unified_timestamp,
   13     urlencode_postdata,
   14 )
   15 
   16 
   17 class BiliBiliIE(InfoExtractor):
   18     _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/v/)(?P<id>\d+)'
   19 
   20     _TEST = {
   21         'url': 'http://www.bilibili.tv/video/av1074402/',
   22         'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e',
   23         'info_dict': {
   24             'id': '1074402',
   25             'ext': 'mp4',
   26             'title': '【金坷垃】金泡沫',
   27             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
   28             'duration': 308.315,
   29             'timestamp': 1398012660,
   30             'upload_date': '20140420',
   31             'thumbnail': r're:^https?://.+\.jpg',
   32             'uploader': '菊子桑',
   33             'uploader_id': '156160',
   34         },
   35     }
   36 
   37     _APP_KEY = '6f90a59ac58a4123'
   38     _BILIBILI_KEY = '0bfd84cc3940035173f35e6777508326'
   39 
   40     def _real_extract(self, url):
   41         video_id = self._match_id(url)
   42         webpage = self._download_webpage(url, video_id)
   43 
   44         if 'anime/v' not in url:
   45             cid = compat_parse_qs(self._search_regex(
   46                 [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
   47                  r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
   48                 webpage, 'player parameters'))['cid'][0]
   49         else:
   50             js = self._download_json(
   51                 'http://bangumi.bilibili.com/web_api/get_source', video_id,
   52                 data=urlencode_postdata({'episode_id': video_id}),
   53                 headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
   54             cid = js['result']['cid']
   55 
   56         payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid)
   57         sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
   58 
   59         video_info = self._download_json(
   60             'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
   61             video_id, note='Downloading video info page')
   62 
   63         entries = []
   64 
   65         for idx, durl in enumerate(video_info['durl']):
   66             formats = [{
   67                 'url': durl['url'],
   68                 'filesize': int_or_none(durl['size']),
   69             }]
   70             for backup_url in durl.get('backup_url', []):
   71                 formats.append({
   72                     'url': backup_url,
   73                     # backup URLs have lower priorities
   74                     'preference': -2 if 'hd.mp4' in backup_url else -3,
   75                 })
   76 
   77             self._sort_formats(formats)
   78 
   79             entries.append({
   80                 'id': '%s_part%s' % (video_id, idx),
   81                 'duration': float_or_none(durl.get('length'), 1000),
   82                 'formats': formats,
   83             })
   84 
   85         title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title')
   86         description = self._html_search_meta('description', webpage)
   87         timestamp = unified_timestamp(self._html_search_regex(
   88             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False))
   89         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
   90 
   91         # TODO 'view_count' requires deobfuscating Javascript
   92         info = {
   93             'id': video_id,
   94             'title': title,
   95             'description': description,
   96             'timestamp': timestamp,
   97             'thumbnail': thumbnail,
   98             'duration': float_or_none(video_info.get('timelength'), scale=1000),
   99         }
  100 
  101         uploader_mobj = re.search(
  102             r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
  103             webpage)
  104         if uploader_mobj:
  105             info.update({
  106                 'uploader': uploader_mobj.group('name'),
  107                 'uploader_id': uploader_mobj.group('id'),
  108             })
  109 
  110         for entry in entries:
  111             entry.update(info)
  112 
  113         if len(entries) == 1:
  114             return entries[0]
  115         else:
  116             for idx, entry in enumerate(entries):
  117                 entry['id'] = '%s_part%d' % (video_id, (idx + 1))
  118 
  119             return {
  120                 '_type': 'multi_video',
  121                 'id': video_id,
  122                 'title': title,
  123                 'description': description,
  124                 'entries': entries,
  125             }