youtube_dl/extractor/massengeschmacktv.py



    1 from __future__ import unicode_literals
    2 
    3 import re
    4 
    5 from .common import InfoExtractor
    6 from ..utils import (
    7     clean_html,
    8     determine_ext,
    9     int_or_none,
   10     js_to_json,
   11     mimetype2ext,
   12     parse_filesize,
   13 )
   14 
   15 
   16 class MassengeschmackTVIE(InfoExtractor):
   17     IE_NAME = 'massengeschmack.tv'
   18     _VALID_URL = r'https?://(?:www\.)?massengeschmack\.tv/play/(?P<id>[^?&#]+)'
   19 
   20     _TEST = {
   21         'url': 'https://massengeschmack.tv/play/fktv202',
   22         'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3',
   23         'info_dict': {
   24             'id': 'fktv202',
   25             'ext': 'mp4',
   26             'title': 'Fernsehkritik-TV - Folge 202',
   27         },
   28     }
   29 
   30     def _real_extract(self, url):
   31         episode = self._match_id(url)
   32 
   33         webpage = self._download_webpage(url, episode)
   34         title = clean_html(self._html_search_regex(
   35             '<h3>([^<]+)</h3>', webpage, 'title'))
   36         thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False)
   37         sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json)
   38 
   39         formats = []
   40         for source in sources:
   41             furl = source.get('src')
   42             if not furl:
   43                 continue
   44             furl = self._proto_relative_url(furl)
   45             ext = determine_ext(furl) or mimetype2ext(source.get('type'))
   46             if ext == 'm3u8':
   47                 formats.extend(self._extract_m3u8_formats(
   48                     furl, episode, 'mp4', 'm3u8_native',
   49                     m3u8_id='hls', fatal=False))
   50             else:
   51                 formats.append({
   52                     'url': furl,
   53                     'format_id': determine_ext(furl),
   54                 })
   55 
   56         for (durl, format_id, width, height, filesize) in re.findall(r'''(?x)
   57                                    <a[^>]+?href="(?P<url>(?:https:)?//[^"]+)".*?
   58                                    <strong>(?P<format_id>.+?)</strong>.*?
   59                                    <small>(?:(?P<width>\d+)x(?P<height>\d+))?\s+?\((?P<filesize>[\d,]+\s*[GM]iB)\)</small>
   60                                 ''', webpage):
   61             formats.append({
   62                 'url': durl,
   63                 'format_id': format_id,
   64                 'width': int_or_none(width),
   65                 'height': int_or_none(height),
   66                 'filesize': parse_filesize(filesize),
   67                 'vcodec': 'none' if format_id.startswith('Audio') else None,
   68             })
   69 
   70         self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr'))
   71 
   72         return {
   73             'id': episode,
   74             'title': title,
   75             'formats': formats,
   76             'thumbnail': thumbnail,
   77         }