youtube_dl/extractor/apa.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import re
    5 
    6 from .common import InfoExtractor
    7 from ..utils import (
    8     determine_ext,
    9     js_to_json,
   10     url_or_none,
   11 )
   12 
   13 
   14 class APAIE(InfoExtractor):
   15     _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
   16     _TESTS = [{
   17         'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
   18         'md5': '2b12292faeb0a7d930c778c7a5b4759b',
   19         'info_dict': {
   20             'id': 'jjv85FdZ',
   21             'ext': 'mp4',
   22             'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
   23             'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
   24             'thumbnail': r're:^https?://.*\.jpg$',
   25             'duration': 254,
   26             'timestamp': 1519211149,
   27             'upload_date': '20180221',
   28         },
   29     }, {
   30         'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78',
   31         'only_matching': True,
   32     }, {
   33         'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76',
   34         'only_matching': True,
   35     }, {
   36         'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81',
   37         'only_matching': True,
   38     }]
   39 
   40     @staticmethod
   41     def _extract_urls(webpage):
   42         return [
   43             mobj.group('url')
   44             for mobj in re.finditer(
   45                 r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
   46                 webpage)]
   47 
   48     def _real_extract(self, url):
   49         video_id = self._match_id(url)
   50 
   51         webpage = self._download_webpage(url, video_id)
   52 
   53         jwplatform_id = self._search_regex(
   54             r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage,
   55             'jwplatform id', default=None)
   56 
   57         if jwplatform_id:
   58             return self.url_result(
   59                 'jwplatform:' + jwplatform_id, ie='JWPlatform',
   60                 video_id=video_id)
   61 
   62         sources = self._parse_json(
   63             self._search_regex(
   64                 r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'),
   65             video_id, transform_source=js_to_json)
   66 
   67         formats = []
   68         for source in sources:
   69             if not isinstance(source, dict):
   70                 continue
   71             source_url = url_or_none(source.get('file'))
   72             if not source_url:
   73                 continue
   74             ext = determine_ext(source_url)
   75             if ext == 'm3u8':
   76                 formats.extend(self._extract_m3u8_formats(
   77                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
   78                     m3u8_id='hls', fatal=False))
   79             else:
   80                 formats.append({
   81                     'url': source_url,
   82                 })
   83         self._sort_formats(formats)
   84 
   85         thumbnail = self._search_regex(
   86             r'image\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
   87             'thumbnail', fatal=False, group='url')
   88 
   89         return {
   90             'id': video_id,
   91             'title': video_id,
   92             'thumbnail': thumbnail,
   93             'formats': formats,
   94         }