youtube_dl/extractor/motherless.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import datetime
    5 import re
    6 
    7 from .common import InfoExtractor
    8 from ..compat import compat_urlparse
    9 from ..utils import (
   10     ExtractorError,
   11     InAdvancePagedList,
   12     orderedSet,
   13     str_to_int,
   14     unified_strdate,
   15 )
   16 
   17 
   18 class MotherlessIE(InfoExtractor):
   19     _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
   20     _TESTS = [{
   21         'url': 'http://motherless.com/AC3FFE1',
   22         'md5': '310f62e325a9fafe64f68c0bccb6e75f',
   23         'info_dict': {
   24             'id': 'AC3FFE1',
   25             'ext': 'mp4',
   26             'title': 'Fucked in the ass while playing PS3',
   27             'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
   28             'upload_date': '20100913',
   29             'uploader_id': 'famouslyfuckedup',
   30             'thumbnail': r're:https?://.*\.jpg',
   31             'age_limit': 18,
   32         }
   33     }, {
   34         'url': 'http://motherless.com/532291B',
   35         'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
   36         'info_dict': {
   37             'id': '532291B',
   38             'ext': 'mp4',
   39             'title': 'Amazing girl playing the omegle game, PERFECT!',
   40             'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
   41                            'game', 'hairy'],
   42             'upload_date': '20140622',
   43             'uploader_id': 'Sulivana7x',
   44             'thumbnail': r're:https?://.*\.jpg',
   45             'age_limit': 18,
   46         },
   47         'skip': '404',
   48     }, {
   49         'url': 'http://motherless.com/g/cosplay/633979F',
   50         'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
   51         'info_dict': {
   52             'id': '633979F',
   53             'ext': 'mp4',
   54             'title': 'Turtlette',
   55             'categories': ['superheroine heroine  superher'],
   56             'upload_date': '20140827',
   57             'uploader_id': 'shade0230',
   58             'thumbnail': r're:https?://.*\.jpg',
   59             'age_limit': 18,
   60         }
   61     }, {
   62         # no keywords
   63         'url': 'http://motherless.com/8B4BBC1',
   64         'only_matching': True,
   65     }, {
   66         # see https://motherless.com/videos/recent for recent videos with
   67         # uploaded date in "ago" format
   68         'url': 'https://motherless.com/3C3E2CF',
   69         'info_dict': {
   70             'id': '3C3E2CF',
   71             'ext': 'mp4',
   72             'title': 'a/ Hot Teens',
   73             'categories': list,
   74             'upload_date': '20210104',
   75             'uploader_id': 'anonymous',
   76             'thumbnail': r're:https?://.*\.jpg',
   77             'age_limit': 18,
   78         },
   79         'params': {
   80             'skip_download': True,
   81         },
   82     }]
   83 
   84     def _real_extract(self, url):
   85         video_id = self._match_id(url)
   86         webpage = self._download_webpage(url, video_id)
   87 
   88         if any(p in webpage for p in (
   89                 '<title>404 - MOTHERLESS.COM<',
   90                 ">The page you're looking for cannot be found.<")):
   91             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
   92 
   93         if '>The content you are trying to view is for friends only.' in webpage:
   94             raise ExtractorError('Video %s is for friends only' % video_id, expected=True)
   95 
   96         title = self._html_search_regex(
   97             (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
   98              r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
   99         video_url = (self._html_search_regex(
  100             (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
  101              r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
  102             webpage, 'video URL', default=None, group='url')
  103             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
  104         age_limit = self._rta_search(webpage)
  105         view_count = str_to_int(self._html_search_regex(
  106             (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
  107             webpage, 'view count', fatal=False))
  108         like_count = str_to_int(self._html_search_regex(
  109             (r'>([\d,.]+)\s+Favorites<',
  110              r'<strong>Favorited</strong>\s+([^<]+)<'),
  111             webpage, 'like count', fatal=False))
  112 
  113         upload_date = unified_strdate(self._search_regex(
  114             r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
  115             'upload date', default=None))
  116         if not upload_date:
  117             uploaded_ago = self._search_regex(
  118                 r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
  119                 default=None)
  120             if uploaded_ago:
  121                 delta = int(uploaded_ago[:-1])
  122                 _AGO_UNITS = {
  123                     'h': 'hours',
  124                     'd': 'days',
  125                 }
  126                 kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
  127                 upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
  128 
  129         comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
  130         uploader_id = self._html_search_regex(
  131             (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''',
  132              r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''),
  133             webpage, 'uploader_id')
  134 
  135         categories = self._html_search_meta('keywords', webpage, default=None)
  136         if categories:
  137             categories = [cat.strip() for cat in categories.split(',')]
  138 
  139         return {
  140             'id': video_id,
  141             'title': title,
  142             'upload_date': upload_date,
  143             'uploader_id': uploader_id,
  144             'thumbnail': self._og_search_thumbnail(webpage),
  145             'categories': categories,
  146             'view_count': view_count,
  147             'like_count': like_count,
  148             'comment_count': comment_count,
  149             'age_limit': age_limit,
  150             'url': video_url,
  151         }
  152 
  153 
  154 class MotherlessGroupIE(InfoExtractor):
  155     _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
  156     _TESTS = [{
  157         'url': 'http://motherless.com/g/movie_scenes',
  158         'info_dict': {
  159             'id': 'movie_scenes',
  160             'title': 'Movie Scenes',
  161             'description': 'Hot and sexy scenes from "regular" movies... '
  162                            'Beautiful actresses fully nude... A looot of '
  163                            'skin! :)Enjoy!',
  164         },
  165         'playlist_mincount': 662,
  166     }, {
  167         'url': 'http://motherless.com/gv/sex_must_be_funny',
  168         'info_dict': {
  169             'id': 'sex_must_be_funny',
  170             'title': 'Sex must be funny',
  171             'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
  172                            'any kind!'
  173         },
  174         'playlist_mincount': 0,
  175         'expected_warnings': [
  176             'This group has no videos.',
  177         ]
  178     }, {
  179         'url': 'https://motherless.com/g/beautiful_cock',
  180         'info_dict': {
  181             'id': 'beautiful_cock',
  182             'title': 'Beautiful Cock',
  183             'description': 'Group for lovely cocks yours, mine, a friends anything human',
  184         },
  185         'playlist_mincount': 2500,
  186     }]
  187 
  188     @classmethod
  189     def suitable(cls, url):
  190         return (False if MotherlessIE.suitable(url)
  191                 else super(MotherlessGroupIE, cls).suitable(url))
  192 
  193     def _extract_entries(self, webpage, base):
  194         entries = []
  195         for mobj in re.finditer(
  196                 r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
  197                 webpage):
  198             video_url = compat_urlparse.urljoin(base, mobj.group('href'))
  199             if not MotherlessIE.suitable(video_url):
  200                 continue
  201             video_id = MotherlessIE._match_id(video_url)
  202             title = mobj.group('title')
  203             entries.append(self.url_result(
  204                 video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
  205                 video_title=title))
  206         # Alternative fallback
  207         if not entries:
  208             entries = [
  209                 self.url_result(
  210                     compat_urlparse.urljoin(base, '/' + entry_id),
  211                     ie=MotherlessIE.ie_key(), video_id=entry_id)
  212                 for entry_id in orderedSet(re.findall(
  213                     r'data-codename=["\']([A-Z0-9]+)', webpage))]
  214         return entries
  215 
  216     def _real_extract(self, url):
  217         group_id = self._match_id(url)
  218         page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
  219         webpage = self._download_webpage(page_url, group_id)
  220         title = self._search_regex(
  221             r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
  222         description = self._html_search_meta(
  223             'description', webpage, fatal=False)
  224         page_count = str_to_int(self._search_regex(
  225             r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
  226             webpage, 'page_count', default=0))
  227         if not page_count:
  228             message = self._search_regex(
  229                 r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''',
  230                 webpage, 'error_msg', default=None) or 'This group has no videos.'
  231             self.report_warning(message, group_id)
  232             page_count = 1
  233         PAGE_SIZE = 80
  234 
  235         def _get_page(idx):
  236             if idx > 0:
  237                 webpage = self._download_webpage(
  238                     page_url, group_id, query={'page': idx + 1},
  239                     note='Downloading page %d/%d' % (idx + 1, page_count)
  240                 )
  241             for entry in self._extract_entries(webpage, url):
  242                 yield entry
  243 
  244         playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
  245 
  246         return {
  247             '_type': 'playlist',
  248             'id': group_id,
  249             'title': title,
  250             'description': description,
  251             'entries': playlist
  252         }