youtube_dl/extractor/twitcasting.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import re
    5 
    6 from .common import InfoExtractor
    7 from ..utils import (
    8     clean_html,
    9     float_or_none,
   10     get_element_by_class,
   11     get_element_by_id,
   12     parse_duration,
   13     str_to_int,
   14     unified_timestamp,
   15     urlencode_postdata,
   16 )
   17 
   18 
   19 class TwitCastingIE(InfoExtractor):
   20     _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)'
   21     _TESTS = [{
   22         'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609',
   23         'md5': '745243cad58c4681dc752490f7540d7f',
   24         'info_dict': {
   25             'id': '2357609',
   26             'ext': 'mp4',
   27             'title': 'Live #2357609',
   28             'uploader_id': 'ivetesangalo',
   29             'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.',
   30             'thumbnail': r're:^https?://.*\.jpg$',
   31             'upload_date': '20110822',
   32             'timestamp': 1314010824,
   33             'duration': 32,
   34             'view_count': int,
   35         },
   36         'params': {
   37             'skip_download': True,
   38         },
   39     }, {
   40         'url': 'https://twitcasting.tv/mttbernardini/movie/3689740',
   41         'info_dict': {
   42             'id': '3689740',
   43             'ext': 'mp4',
   44             'title': 'Live playing something #3689740',
   45             'uploader_id': 'mttbernardini',
   46             'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.',
   47             'thumbnail': r're:^https?://.*\.jpg$',
   48             'upload_date': '20120212',
   49             'timestamp': 1329028024,
   50             'duration': 681,
   51             'view_count': int,
   52         },
   53         'params': {
   54             'skip_download': True,
   55             'videopassword': 'abc',
   56         },
   57     }]
   58 
   59     def _real_extract(self, url):
   60         uploader_id, video_id = re.match(self._VALID_URL, url).groups()
   61 
   62         video_password = self._downloader.params.get('videopassword')
   63         request_data = None
   64         if video_password:
   65             request_data = urlencode_postdata({
   66                 'password': video_password,
   67             })
   68         webpage = self._download_webpage(url, video_id, data=request_data)
   69 
   70         title = clean_html(get_element_by_id(
   71             'movietitle', webpage)) or self._html_search_meta(
   72             ['og:title', 'twitter:title'], webpage, fatal=True)
   73 
   74         video_js_data = {}
   75         m3u8_url = self._search_regex(
   76             r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
   77             webpage, 'm3u8 url', group='url', default=None)
   78         if not m3u8_url:
   79             video_js_data = self._parse_json(self._search_regex(
   80                 r"data-movie-playlist='(\[[^']+\])'",
   81                 webpage, 'movie playlist'), video_id)[0]
   82             m3u8_url = video_js_data['source']['url']
   83 
   84         # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
   85         formats = self._extract_m3u8_formats(
   86             m3u8_url, video_id, 'mp4', m3u8_id='hls')
   87 
   88         thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage)
   89         description = clean_html(get_element_by_id(
   90             'authorcomment', webpage)) or self._html_search_meta(
   91             ['description', 'og:description', 'twitter:description'], webpage)
   92         duration = float_or_none(video_js_data.get(
   93             'duration'), 1000) or parse_duration(clean_html(
   94                 get_element_by_class('tw-player-duration-time', webpage)))
   95         view_count = str_to_int(self._search_regex(
   96             r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None))
   97         timestamp = unified_timestamp(self._search_regex(
   98             r'data-toggle="true"[^>]+datetime="([^"]+)"',
   99             webpage, 'datetime', None))
  100 
  101         return {
  102             'id': video_id,
  103             'title': title,
  104             'description': description,
  105             'thumbnail': thumbnail,
  106             'timestamp': timestamp,
  107             'uploader_id': uploader_id,
  108             'duration': duration,
  109             'view_count': view_count,
  110             'formats': formats,
  111         }