youtube_dl/extractor/infoq.py



    1 # coding: utf-8
    2 
    3 from __future__ import unicode_literals
    4 
    5 from ..compat import (
    6     compat_b64decode,
    7     compat_urllib_parse_unquote,
    8     compat_urlparse,
    9 )
   10 from ..utils import (
   11     determine_ext,
   12     update_url_query,
   13 )
   14 from .bokecc import BokeCCBaseIE
   15 
   16 
   17 class InfoQIE(BokeCCBaseIE):
   18     _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
   19 
   20     _TESTS = [{
   21         'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
   22         'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
   23         'info_dict': {
   24             'id': 'A-Few-of-My-Favorite-Python-Things',
   25             'ext': 'mp4',
   26             'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
   27             'title': 'A Few of My Favorite [Python] Things',
   28         },
   29     }, {
   30         'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
   31         'only_matching': True,
   32     }, {
   33         'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery',
   34         'md5': '4918d0cca1497f2244572caf626687ef',
   35         'info_dict': {
   36             'id': 'openstack-continued-delivery',
   37             'title': 'OpenStack持续交付之路',
   38             'ext': 'flv',
   39             'description': 'md5:308d981fb28fa42f49f9568322c683ff',
   40         },
   41     }, {
   42         'url': 'https://www.infoq.com/presentations/Simple-Made-Easy',
   43         'md5': '0e34642d4d9ef44bf86f66f6399672db',
   44         'info_dict': {
   45             'id': 'Simple-Made-Easy',
   46             'title': 'Simple Made Easy',
   47             'ext': 'mp3',
   48             'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b',
   49         },
   50         'params': {
   51             'format': 'bestaudio',
   52         },
   53     }]
   54 
   55     def _extract_rtmp_video(self, webpage):
   56         # The server URL is hardcoded
   57         video_url = 'rtmpe://video.infoq.com/cfx/st/'
   58 
   59         # Extract video URL
   60         encoded_id = self._search_regex(
   61             r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None)
   62 
   63         real_id = compat_urllib_parse_unquote(compat_b64decode(encoded_id).decode('utf-8'))
   64         playpath = 'mp4:' + real_id
   65 
   66         return [{
   67             'format_id': 'rtmp_video',
   68             'url': video_url,
   69             'ext': determine_ext(playpath),
   70             'play_path': playpath,
   71         }]
   72 
   73     def _extract_cf_auth(self, webpage):
   74         policy = self._search_regex(r'InfoQConstants\.scp\s*=\s*\'([^\']+)\'', webpage, 'policy')
   75         signature = self._search_regex(r'InfoQConstants\.scs\s*=\s*\'([^\']+)\'', webpage, 'signature')
   76         key_pair_id = self._search_regex(r'InfoQConstants\.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id')
   77         return {
   78             'Policy': policy,
   79             'Signature': signature,
   80             'Key-Pair-Id': key_pair_id,
   81         }
   82 
   83     def _extract_http_video(self, webpage):
   84         http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL')
   85         http_video_url = update_url_query(http_video_url, self._extract_cf_auth(webpage))
   86         return [{
   87             'format_id': 'http_video',
   88             'url': http_video_url,
   89         }]
   90 
   91     def _extract_http_audio(self, webpage, video_id):
   92         fields = self._hidden_inputs(webpage)
   93         http_audio_url = fields.get('filename')
   94         if not http_audio_url:
   95             return []
   96 
   97         # base URL is found in the Location header in the response returned by
   98         # GET https://www.infoq.com/mp3download.action?filename=... when logged in.
   99         http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url)
  100         http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage))
  101 
  102         # audio file seem to be missing some times even if there is a download link
  103         # so probe URL to make sure
  104         if not self._is_valid_url(http_audio_url, video_id):
  105             return []
  106 
  107         return [{
  108             'format_id': 'http_audio',
  109             'url': http_audio_url,
  110             'vcodec': 'none',
  111         }]
  112 
  113     def _real_extract(self, url):
  114         video_id = self._match_id(url)
  115         webpage = self._download_webpage(url, video_id)
  116 
  117         video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
  118         video_description = self._html_search_meta('description', webpage, 'description')
  119 
  120         if '/cn/' in url:
  121             # for China videos, HTTP video URL exists but always fails with 403
  122             formats = self._extract_bokecc_formats(webpage, video_id)
  123         else:
  124             formats = (
  125                 self._extract_rtmp_video(webpage)
  126                 + self._extract_http_video(webpage)
  127                 + self._extract_http_audio(webpage, video_id))
  128 
  129         self._sort_formats(formats)
  130 
  131         return {
  132             'id': video_id,
  133             'title': video_title,
  134             'description': video_description,
  135             'formats': formats,
  136         }