summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor/dailymotion.py
blob: 439033d2310d1be9034c225c9ed7eee39b9c56bd (plain)
    1 import re
    2 import json
    3 import itertools
    4 
    5 from .common import InfoExtractor
    6 from ..utils import (
    7     compat_urllib_request,
    8     get_element_by_attribute,
    9     get_element_by_id,
   10 
   11     ExtractorError,
   12 )
   13 
   14 class DailymotionIE(InfoExtractor):
   15     """Information Extractor for Dailymotion"""
   16 
   17     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
   18     IE_NAME = u'dailymotion'
   19     _TEST = {
   20         u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
   21         u'file': u'x33vw9.mp4',
   22         u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
   23         u'info_dict': {
   24             u"uploader": u"Amphora Alex and Van .", 
   25             u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
   26         }
   27     }
   28 
   29     def _real_extract(self, url):
   30         # Extract id and simplified title from URL
   31         mobj = re.match(self._VALID_URL, url)
   32 
   33         video_id = mobj.group(1).split('_')[0].split('?')[0]
   34 
   35         video_extension = 'mp4'
   36 
   37         # Retrieve video webpage to extract further information
   38         request = compat_urllib_request.Request(url)
   39         request.add_header('Cookie', 'family_filter=off')
   40         webpage = self._download_webpage(request, video_id)
   41 
   42         # Extract URL, uploader and title from webpage
   43         self.report_extraction(video_id)
   44 
   45         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
   46                                              # Looking for official user
   47                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
   48                                             webpage, 'video uploader')
   49 
   50         video_upload_date = None
   51         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
   52         if mobj is not None:
   53             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
   54 
   55         embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
   56         embed_page = self._download_webpage(embed_url, video_id,
   57                                             u'Downloading embed page')
   58         info = self._search_regex(r'var info = ({.*?}),$', embed_page,
   59             'video info', flags=re.MULTILINE)
   60         info = json.loads(info)
   61 
   62         # TODO: support choosing qualities
   63 
   64         for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
   65                     'stream_h264_hq_url','stream_h264_url',
   66                     'stream_h264_ld_url']:
   67             if info.get(key):#key in info and info[key]:
   68                 max_quality = key
   69                 self.to_screen(u'Using %s' % key)
   70                 break
   71         else:
   72             raise ExtractorError(u'Unable to extract video URL')
   73         video_url = info[max_quality]
   74 
   75         return [{
   76             'id':       video_id,
   77             'url':      video_url,
   78             'uploader': video_uploader,
   79             'upload_date':  video_upload_date,
   80             'title':    self._og_search_title(webpage),
   81             'ext':      video_extension,
   82             'thumbnail': info['thumbnail_url']
   83         }]
   84 
   85 
   86 class DailymotionPlaylistIE(InfoExtractor):
   87     _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
   88     _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
   89 
   90     def _real_extract(self, url):
   91         mobj = re.match(self._VALID_URL, url)
   92         playlist_id =  mobj.group('id')
   93         video_ids = []
   94 
   95         for pagenum in itertools.count(1):
   96             webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum),
   97                                              playlist_id, u'Downloading page %s' % pagenum)
   98 
   99             playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
  100             video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el))
  101 
  102             if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
  103                 break
  104 
  105         entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
  106                    for video_id in video_ids]
  107         return {'_type': 'playlist',
  108                 'id': playlist_id,
  109                 'title': get_element_by_id(u'playlist_name', webpage),
  110                 'entries': entries,
  111                 }

Generated by cgit