youtube_dl/extractor/thisav.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import re
    5 
    6 from .common import InfoExtractor
    7 from ..utils import remove_end
    8 
    9 
   10 class ThisAVIE(InfoExtractor):
   11     _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
   12     _TESTS = [{
   13         # jwplayer
   14         'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
   15         'md5': '0480f1ef3932d901f0e0e719f188f19b',
   16         'info_dict': {
   17             'id': '47734',
   18             'ext': 'flv',
   19             'title': '高樹マリア - Just fit',
   20             'uploader': 'dj7970',
   21             'uploader_id': 'dj7970'
   22         }
   23     }, {
   24         # html5 media
   25         'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html',
   26         'md5': 'ba90c076bd0f80203679e5b60bf523ee',
   27         'info_dict': {
   28             'id': '242352',
   29             'ext': 'mp4',
   30             'title': 'Nerdy 18yo Big Ass Tattoos and Glasses',
   31             'uploader': 'cybersluts',
   32             'uploader_id': 'cybersluts',
   33         },
   34     }]
   35 
   36     def _real_extract(self, url):
   37         mobj = re.match(self._VALID_URL, url)
   38 
   39         video_id = mobj.group('id')
   40         webpage = self._download_webpage(url, video_id)
   41         title = remove_end(self._html_search_regex(
   42             r'<title>([^<]+)</title>', webpage, 'title'),
   43             ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
   44         video_url = self._html_search_regex(
   45             r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
   46         if video_url:
   47             info_dict = {
   48                 'formats': [{
   49                     'url': video_url,
   50                 }],
   51             }
   52         else:
   53             entries = self._parse_html5_media_entries(url, webpage, video_id)
   54             if entries:
   55                 info_dict = entries[0]
   56             else:
   57                 info_dict = self._extract_jwplayer_data(
   58                     webpage, video_id, require_title=False)
   59         uploader = self._html_search_regex(
   60             r': <a href="http://www\.thisav\.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
   61             webpage, 'uploader name', fatal=False)
   62         uploader_id = self._html_search_regex(
   63             r': <a href="http://www\.thisav\.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
   64             webpage, 'uploader id', fatal=False)
   65 
   66         info_dict.update({
   67             'id': video_id,
   68             'uploader': uploader,
   69             'uploader_id': uploader_id,
   70             'title': title,
   71         })
   72 
   73         return info_dict