youtube_dl/extractor/youjizz.py



    1 from __future__ import unicode_literals
    2 
    3 import re
    4 
    5 from .common import InfoExtractor
    6 from ..compat import compat_str
    7 from ..utils import (
    8     determine_ext,
    9     int_or_none,
   10     parse_duration,
   11 )
   12 
   13 
   14 class YouJizzIE(InfoExtractor):
   15     _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
   16     _TESTS = [{
   17         'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
   18         'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
   19         'info_dict': {
   20             'id': '2189178',
   21             'ext': 'mp4',
   22             'title': 'Zeichentrick 1',
   23             'age_limit': 18,
   24             'duration': 2874,
   25         }
   26     }, {
   27         'url': 'http://www.youjizz.com/videos/-2189178.html',
   28         'only_matching': True,
   29     }, {
   30         'url': 'https://www.youjizz.com/videos/embed/31991001',
   31         'only_matching': True,
   32     }]
   33 
   34     def _real_extract(self, url):
   35         mobj = re.match(self._VALID_URL, url)
   36         video_id = mobj.group('id') or mobj.group('embed_id')
   37 
   38         webpage = self._download_webpage(url, video_id)
   39 
   40         title = self._html_search_regex(
   41             r'<title>(.+?)</title>', webpage, 'title')
   42 
   43         formats = []
   44 
   45         encodings = self._parse_json(
   46             self._search_regex(
   47                 r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
   48                 default='[]'),
   49             video_id, fatal=False)
   50         for encoding in encodings:
   51             if not isinstance(encoding, dict):
   52                 continue
   53             format_url = encoding.get('filename')
   54             if not isinstance(format_url, compat_str):
   55                 continue
   56             if determine_ext(format_url) == 'm3u8':
   57                 formats.extend(self._extract_m3u8_formats(
   58                     format_url, video_id, 'mp4', entry_protocol='m3u8_native',
   59                     m3u8_id='hls', fatal=False))
   60             else:
   61                 format_id = encoding.get('name') or encoding.get('quality')
   62                 height = int_or_none(self._search_regex(
   63                     r'^(\d+)[pP]', format_id, 'height', default=None))
   64                 formats.append({
   65                     'url': format_url,
   66                     'format_id': format_id,
   67                     'height': height,
   68                 })
   69 
   70         if formats:
   71             info_dict = {
   72                 'formats': formats,
   73             }
   74         else:
   75             # YouJizz's HTML5 player has invalid HTML
   76             webpage = webpage.replace('"controls', '" controls')
   77             info_dict = self._parse_html5_media_entries(
   78                 url, webpage, video_id)[0]
   79 
   80         duration = parse_duration(self._search_regex(
   81             r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
   82             default=None))
   83         uploader = self._search_regex(
   84             r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
   85             default=None)
   86 
   87         info_dict.update({
   88             'id': video_id,
   89             'title': title,
   90             'age_limit': self._rta_search(webpage),
   91             'duration': duration,
   92             'uploader': uploader,
   93         })
   94 
   95         return info_dict