summaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/motherless.py
blob: 35d2b46ed31e47973f2a90106491508568e75de0 (plain)
    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import datetime
    5 import re
    6 
    7 from .common import InfoExtractor
    8 from ..compat import compat_urlparse
    9 from ..utils import (
   10     ExtractorError,
   11     InAdvancePagedList,
   12     orderedSet,
   13     str_to_int,
   14     unified_strdate,
   15 )
   16 
   17 
   18 class MotherlessIE(InfoExtractor):
   19     _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
   20     _TESTS = [{
   21         'url': 'http://motherless.com/AC3FFE1',
   22         'md5': '310f62e325a9fafe64f68c0bccb6e75f',
   23         'info_dict': {
   24             'id': 'AC3FFE1',
   25             'ext': 'mp4',
   26             'title': 'Fucked in the ass while playing PS3',
   27             'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
   28             'upload_date': '20100913',
   29             'uploader_id': 'famouslyfuckedup',
   30             'thumbnail': r're:https?://.*\.jpg',
   31             'age_limit': 18,
   32         }
   33     }, {
   34         'url': 'http://motherless.com/532291B',
   35         'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
   36         'info_dict': {
   37             'id': '532291B',
   38             'ext': 'mp4',
   39             'title': 'Amazing girl playing the omegle game, PERFECT!',
   40             'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
   41                            'game', 'hairy'],
   42             'upload_date': '20140622',
   43             'uploader_id': 'Sulivana7x',
   44             'thumbnail': r're:https?://.*\.jpg',
   45             'age_limit': 18,
   46         },
   47         'skip': '404',
   48     }, {
   49         'url': 'http://motherless.com/g/cosplay/633979F',
   50         'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
   51         'info_dict': {
   52             'id': '633979F',
   53             'ext': 'mp4',
   54             'title': 'Turtlette',
   55             'categories': ['superheroine heroine  superher'],
   56             'upload_date': '20140827',
   57             'uploader_id': 'shade0230',
   58             'thumbnail': r're:https?://.*\.jpg',
   59             'age_limit': 18,
   60         }
   61     }, {
   62         # no keywords
   63         'url': 'http://motherless.com/8B4BBC1',
   64         'only_matching': True,
   65     }, {
   66         # see https://motherless.com/videos/recent for recent videos with
   67         # uploaded date in "ago" format
   68         'url': 'https://motherless.com/3C3E2CF',
   69         'info_dict': {
   70             'id': '3C3E2CF',
   71             'ext': 'mp4',
   72             'title': 'a/ Hot Teens',
   73             'categories': list,
   74             'upload_date': '20210104',
   75             'uploader_id': 'anonymous',
   76             'thumbnail': r're:https?://.*\.jpg',
   77             'age_limit': 18,
   78         },
   79         'params': {
   80             'skip_download': True,
   81         },
   82     }]
   83 
   84     def _real_extract(self, url):
   85         video_id = self._match_id(url)
   86         webpage = self._download_webpage(url, video_id)
   87 
   88         if any(p in webpage for p in (
   89                 '<title>404 - MOTHERLESS.COM<',
   90                 ">The page you're looking for cannot be found.<")):
   91             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
   92 
   93         if '>The content you are trying to view is for friends only.' in webpage:
   94             raise ExtractorError('Video %s is for friends only' % video_id, expected=True)
   95 
   96         title = self._html_search_regex(
   97             (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
   98              r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
   99         video_url = (self._html_search_regex(
  100             (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
  101              r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
  102             webpage, 'video URL', default=None, group='url')
  103             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
  104         age_limit = self._rta_search(webpage)
  105         view_count = str_to_int(self._html_search_regex(
  106             (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
  107             webpage, 'view count', fatal=False))
  108         like_count = str_to_int(self._html_search_regex(
  109             (r'>([\d,.]+)\s+Favorites<',
  110              r'<strong>Favorited</strong>\s+([^<]+)<'),
  111             webpage, 'like count', fatal=False))
  112 
  113         upload_date = unified_strdate(self._search_regex(
  114             r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
  115             'upload date', default=None))
  116         if not upload_date:
  117             uploaded_ago = self._search_regex(
  118                 r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
  119                 default=None)
  120             if uploaded_ago:
  121                 delta = int(uploaded_ago[:-1])
  122                 _AGO_UNITS = {
  123                     'h': 'hours',
  124                     'd': 'days',
  125                 }
  126                 kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
  127                 upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
  128 
  129         comment_count = webpage.count('class="media-comment-contents"')
  130         uploader_id = self._html_search_regex(
  131             r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''',
  132             webpage, 'uploader_id')
  133 
  134         categories = self._html_search_meta('keywords', webpage, default=None)
  135         if categories:
  136             categories = [cat.strip() for cat in categories.split(',')]
  137 
  138         return {
  139             'id': video_id,
  140             'title': title,
  141             'upload_date': upload_date,
  142             'uploader_id': uploader_id,
  143             'thumbnail': self._og_search_thumbnail(webpage),
  144             'categories': categories,
  145             'view_count': view_count,
  146             'like_count': like_count,
  147             'comment_count': comment_count,
  148             'age_limit': age_limit,
  149             'url': video_url,
  150         }
  151 
  152 
  153 class MotherlessGroupIE(InfoExtractor):
  154     _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
  155     _TESTS = [{
  156         'url': 'http://motherless.com/g/movie_scenes',
  157         'info_dict': {
  158             'id': 'movie_scenes',
  159             'title': 'Movie Scenes',
  160             'description': 'Hot and sexy scenes from "regular" movies... '
  161                            'Beautiful actresses fully nude... A looot of '
  162                            'skin! :)Enjoy!',
  163         },
  164         'playlist_mincount': 662,
  165     }, {
  166         'url': 'http://motherless.com/gv/sex_must_be_funny',
  167         'info_dict': {
  168             'id': 'sex_must_be_funny',
  169             'title': 'Sex must be funny',
  170             'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
  171                            'any kind!'
  172         },
  173         'playlist_mincount': 0,
  174     }]
  175 
  176     @classmethod
  177     def suitable(cls, url):
  178         return (False if MotherlessIE.suitable(url)
  179                 else super(MotherlessGroupIE, cls).suitable(url))
  180 
  181     def _extract_entries(self, webpage, base):
  182         entries = []
  183         for mobj in re.finditer(
  184                 r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
  185                 webpage):
  186             video_url = compat_urlparse.urljoin(base, mobj.group('href'))
  187             if not MotherlessIE.suitable(video_url):
  188                 continue
  189             video_id = MotherlessIE._match_id(video_url)
  190             title = mobj.group('title')
  191             entries.append(self.url_result(
  192                 video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
  193                 video_title=title))
  194         # Alternative fallback
  195         if not entries:
  196             entries = [
  197                 self.url_result(
  198                     compat_urlparse.urljoin(base, '/' + entry_id),
  199                     ie=MotherlessIE.ie_key(), video_id=entry_id)
  200                 for entry_id in orderedSet(re.findall(
  201                     r'data-codename=["\']([A-Z0-9]+)', webpage))]
  202         return entries
  203 
  204     def _real_extract(self, url):
  205         group_id = self._match_id(url)
  206         page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
  207         webpage = self._download_webpage(page_url, group_id)
  208         title = self._search_regex(
  209             r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
  210         description = self._html_search_meta(
  211             'description', webpage, fatal=False)
  212         page_count = str_to_int(self._search_regex(
  213             r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
  214             webpage, 'page_count', default='1'))
  215         PAGE_SIZE = 80
  216 
  217         def _get_page(idx):
  218             webpage = self._download_webpage(
  219                 page_url, group_id, query={'page': idx + 1},
  220                 note='Downloading page %d/%d' % (idx + 1, page_count)
  221             )
  222             for entry in self._extract_entries(webpage, url):
  223                 yield entry
  224 
  225         playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
  226 
  227         return {
  228             '_type': 'playlist',
  229             'id': group_id,
  230             'title': title,
  231             'description': description,
  232             'entries': playlist
  233         }

Generated by cgit