summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor/thisvid.py
blob: bc4bcb2d1f489d0a4bca49ebc43b98261dc8f8d5 (plain)
    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import re
    5 import itertools
    6 
    7 from .common import InfoExtractor
    8 from ..compat import (
    9     compat_urlparse,
   10 )
   11 from ..utils import (
   12     clean_html,
   13     get_element_by_class,
   14     int_or_none,
   15     merge_dicts,
   16     url_or_none,
   17     urljoin,
   18 )
   19 
   20 
   21 class ThisVidIE(InfoExtractor):
   22     _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
   23     _TESTS = [{
   24         'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
   25         'md5': '839becb572995687e11a69dc4358a386',
   26         'info_dict': {
   27             'id': '3533241',
   28             'ext': 'mp4',
   29             'title': 'Sitting on ball tight jeans',
   30             'description': 'md5:372353bb995883d1b65fddf507489acd',
   31             'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
   32             'uploader_id': '150629',
   33             'uploader': 'jeanslevisjeans',
   34             'age_limit': 18,
   35         }
   36     }, {
   37         'url': 'https://thisvid.com/embed/3533241/',
   38         'md5': '839becb572995687e11a69dc4358a386',
   39         'info_dict': {
   40             'id': '3533241',
   41             'ext': 'mp4',
   42             'title': 'Sitting on ball tight jeans',
   43             'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
   44             'uploader_id': '150629',
   45             'uploader': 'jeanslevisjeans',
   46             'age_limit': 18,
   47         }
   48     }]
   49 
   50     def _real_extract(self, url):
   51         main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type')
   52         webpage = self._download_webpage(url, main_id)
   53 
   54         title = self._html_search_regex(
   55             r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
   56             webpage, 'title')
   57 
   58         if type_ == 'embed':
   59             # look for more metadata
   60             video_alt_url = url_or_none(self._search_regex(
   61                 r'''video_alt_url\s*:\s+'(%s/)',''' % (self._VALID_URL, ),
   62                 webpage, 'video_alt_url', default=None))
   63             if video_alt_url and video_alt_url != url:
   64                 webpage = self._download_webpage(
   65                     video_alt_url, main_id,
   66                     note='Redirecting embed to main page', fatal=False) or webpage
   67 
   68         video_holder = get_element_by_class('video-holder', webpage) or ''
   69         if '>This video is a private video' in video_holder:
   70             self.raise_login_required(
   71                 (clean_html(video_holder) or 'Private video').split('\n', 1)[0])
   72 
   73         uploader = self._html_search_regex(
   74             r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
   75             webpage, 'uploader', default='')
   76         uploader = re.split(r'''/["'][^>]*>\s*''', uploader)
   77         if len(uploader) == 2:
   78             # id must be non-empty, uploader could be ''
   79             uploader_id, uploader = uploader
   80             uploader = uploader or None
   81         else:
   82             uploader_id = uploader = None
   83 
   84         return merge_dicts({
   85             '_type': 'url_transparent',
   86             'title': title,
   87             'age_limit': 18,
   88             'uploader': uploader,
   89             'uploader_id': uploader_id,
   90         }, self.url_result(url, ie='Generic'))
   91 
   92 
   93 class ThisVidMemberIE(InfoExtractor):
   94     _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
   95     _TESTS = [{
   96         'url': 'https://thisvid.com/members/2140501/',
   97         'info_dict': {
   98             'id': '2140501',
   99             'title': 'Rafflesia\'s Profile',
  100         },
  101         'playlist_mincount': 16,
  102     }, {
  103         'url': 'https://thisvid.com/members/2140501/favourite_videos/',
  104         'info_dict': {
  105             'id': '2140501',
  106             'title': 'Rafflesia\'s Favourite Videos',
  107         },
  108         'playlist_mincount': 15,
  109     }, {
  110         'url': 'https://thisvid.com/members/636468/public_videos/',
  111         'info_dict': {
  112             'id': '636468',
  113             'title': 'Happymouth\'s Public Videos',
  114         },
  115         'playlist_mincount': 196,
  116     },
  117     ]
  118 
  119     def _urls(self, html):
  120         for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (ThisVidIE._VALID_URL, ), html):
  121             yield m.group('url')
  122 
  123     def _real_extract(self, url):
  124         pl_id = self._match_id(url)
  125         webpage = self._download_webpage(url, pl_id)
  126 
  127         title = re.split(
  128             r'(?i)\s*\|\s*ThisVid\.com\s*$',
  129             self._og_search_title(webpage, default=None) or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', 1)[0] or None
  130 
  131         def entries(page_url, html=None):
  132             for page in itertools.count(1):
  133                 if not html:
  134                     html = self._download_webpage(
  135                         page_url, pl_id, note='Downloading page %d' % (page, ),
  136                         fatal=False) or ''
  137                 for u in self._urls(html):
  138                     yield u
  139                 next_page = get_element_by_class('pagination-next', html) or ''
  140                 if next_page:
  141                     # member list page
  142                     next_page = urljoin(url, self._search_regex(
  143                         r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
  144                         next_page, 'next page link', group='url', default=None))
  145                 # in case a member page should have pagination-next with empty link, not just `else:`
  146                 if next_page is None:
  147                     # playlist page
  148                     parsed_url = compat_urlparse.urlparse(page_url)
  149                     base_path, num = parsed_url.path.rsplit('/', 1)
  150                     num = int_or_none(num)
  151                     if num is None:
  152                         base_path, num = parsed_url.path.rstrip('/'), 1
  153                     parsed_url = parsed_url._replace(path=base_path + ('/%d' % (num + 1, )))
  154                     next_page = compat_urlparse.urlunparse(parsed_url)
  155                     if page_url == next_page:
  156                         next_page = None
  157                 if not next_page:
  158                     break
  159                 page_url, html = next_page, None
  160 
  161         return self.playlist_from_matches(
  162             entries(url, webpage), playlist_id=pl_id, playlist_title=title, ie='ThisVid')
  163 
  164 
  165 class ThisVidPlaylistIE(ThisVidMemberIE):
  166     _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
  167     _TESTS = [{
  168         'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
  169         'info_dict': {
  170             'id': '6615',
  171             'title': 'Underwear Stuff',
  172         },
  173         'playlist_mincount': 200,
  174     }, {
  175         'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
  176         'info_dict': {
  177             'id': '1072387',
  178             'ext': 'mp4',
  179             'title': 'Big Italian Booty 28',
  180             'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
  181             'uploader_id': '367912',
  182             'uploader': 'Jcmusclefun',
  183             'age_limit': 18,
  184         },
  185         'params': {
  186             'noplaylist': True,
  187         },
  188     }]
  189 
  190     def _get_video_url(self, pl_url):
  191         video_id = re.match(self._VALID_URL, pl_url).group('video_id')
  192         return urljoin(pl_url, '/videos/%s/' % (video_id, ))
  193 
  194     def _urls(self, html):
  195         for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (self._VALID_URL, ), html):
  196             yield self._get_video_url(m.group('url'))
  197 
  198     def _real_extract(self, url):
  199         pl_id = self._match_id(url)
  200 
  201         if self._downloader.params.get('noplaylist'):
  202             self.to_screen('Downloading just the featured video because of --no-playlist')
  203             return self.url_result(self._get_video_url(url), 'ThisVid')
  204 
  205         self.to_screen(
  206             'Downloading playlist %s - add --no-playlist to download just the featured video' % (pl_id, ))
  207         result = super(ThisVidPlaylistIE, self)._real_extract(url)
  208 
  209         # rework title returned as `the title - the title`
  210         title = result['title']
  211         t_len = len(title)
  212         if t_len > 5 and t_len % 2 != 0:
  213             t_len = t_len // 2
  214             if title[t_len] == '-':
  215                 title = [t.strip() for t in (title[:t_len], title[t_len + 1:])]
  216                 if title[0] and title[0] == title[1]:
  217                     result['title'] = title[0]
  218         return result

Generated by cgit