1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces the metadata of the
    video (or videos) that URL refers to: the real media URL, the title,
    the uploader and so on.  The metadata is collected in a dictionary that
    is handed to the FileDownloader, which may then download the video to
    the file system, among other possible outcomes.

    Every dictionary must carry these fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    These fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define the _real_initialize() and _real_extract()
    methods and define a _VALID_URL regexp; normally they should also be
    added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, set the _WORKING attribute to False for broken IEs so that
    users are warned and the tests are skipped.
    """

    # Class-level defaults; __init__ re-assigns the instance-level state.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name with the trailing "IE" dropped.
        class_name = type(self).__name__
        return class_name[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None means "print the default message"; note=False silences it.
        if note is not False:
            if note is None:
                self.report_download_webpage(video_id)
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset advertised in the Content-Type header, if any;
        # otherwise assume UTF-8.
        content_type = urlh.headers.get('Content-Type', '')
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL was passed instead of a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return raw_page.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608: they set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url', 'url': url, 'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist', 'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
        (
            (?:https?://)?                                       # http(s):// (optional)
            (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
               tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
            (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
            (?:                                                  # the various things that can precede the ID:
                (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                |(?:                                             # or the v= param in all its forms
                    (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                    (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                    (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                    v=
                )
            )?                                                   # optional -> youtube.com/xxxx is OK
        )?                                                       # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
        (?(1).+)?                                                # if we found the ID, everything can follow
        $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed is assumed to be flv.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string, used only for --list-formats style output.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so let the playlist IE claim them.
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name}, or an (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of result tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print "itag : extension [dimensions]" for each known format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, when credentials are available, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form tokens required by the Google login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # urlencode() returns text; POST data must be bytes on Python 3
        # (ASCII-safe here), matching the login POST above.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video id embedded in url, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 2 is the bare video id (group 1 is the optional URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in turn until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # strptime only raises ValueError on a non-matching format;
                    # a bare except here would also swallow KeyboardInterrupt.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # Require 'sig' as well: url_map below concatenates it, and an
            # entry without it would otherwise crash with a KeyError instead
            # of falling through to the explicit "no known formats" error.
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter confirmation."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # urlencode() returns text; POST data must be bytes on Python 3
        # (ASCII-safe here), consistent with the other POSTs in this module.
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage.  Note that
        # _download_webpage() already returns a decoded text string, so the
        # old .decode('utf-8') calls on the match results were wrong: str has
        # no .decode() on Python 3, so they would raise AttributeError there.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
786
787
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def _real_extract(self, url):
        # Pull the video id out of the URL.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the path component up to the first '_' or '?'.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The media URLs live in a JS "flashvars" assignment.
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Walk the qualities from best to worst and keep the first one present.
        max_quality = None
        for key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the
        # official-user markup; a miss is only a warning.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; reassemble it as YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
868
869
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Matches only URLs whose 'current' query parameter points at a .flv file.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page.

        Returns a single-item list of info dicts, or None after reporting an
        error.  NOTE(review): the .decode('utf-8') calls imply the webpage
        (and everything matched on it) is a byte string, i.e. this is
        Python-2-era code — confirm before running under Python 3.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # The direct media URL lives in the 'file' parameter of the video_src link.
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from one <title> pattern: group 1 is the
        # title, group 2 the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
922
923
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video page.

        Non-'/watch/' URLs are first resolved to the canonical
        http://video.yahoo.com/watch/<vid>/<id> form and re-extracted
        (new_video=False marks that second pass).  Returns a single-item
        list of info dicts, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) captures the 'people'/'profile' path component;
        # the uploader name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1054
1055
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com page.

        Parses the embedded 'config' JSON for metadata and builds the
        play_redirect URL from the request signature/timestamp.  Returns a
        single-item list of info dicts, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and direct-link URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's script block.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit; the split can raise IndexError and
        # json.loads raises ValueError.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (optional; empty string when absent)
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (optional), stored as YYYYMMDD
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket; the for/else reports an
        # error only when every bucket is empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1166
1167
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<digits>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        # Download url and return its body; returns None (after reporting
        # the error) when the download or the URL itself is invalid.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch url, apply regex, and build an info dict from matchTuples,
        # a list of (group_index, key, error_message) triples.  Returns None
        # (after reporting) when the page does not match or a group is empty.
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # raises TypeError here rather than reporting cleanly — confirm.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Resolve the live-stream RTMP URL via the videothek JS file.
        # NOTE(review): video_url is computed but never returned or stored,
        # so the result is discarded (and _real_extract returns nothing for
        # live URLs) — this method looks unfinished.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Follow the chain page -> videoref XML -> per-language video XML
        # and return a complete info dict for the HD stream.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        # Dispatch between live streams and regular (Plus7) videos.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): live extraction returns no info dict (see
            # extractLiveStream above), so live URLs yield nothing.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1291
1292
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that this is the fallback IE."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: signal the caller to extract url itself.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape common
        JW Player / SWFObject embed patterns from the page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this failure used to be misreported as
            # u'unable to extract title'.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1429
1430
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:<terms>' pseudo-URL and return results.

        Returns the playlist produced by _get_n_results, or None after
        reporting an error on a malformed query or count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the fixed 8-character 'ytsearch' prefix, keeping the count part.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUGFIX: this branch used to drop the result list (missing
            # return), so 'ytsearchall:' queries yielded nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results per page, until the limit
        # (capped by the API's reported totalItems) is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1501
1502
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query.decode() implies byte-string queries, i.e.
        # Python-2-era code — confirm before running under Python 3.
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse the 'gvsearch[N|all]:<terms>' pseudo-URL; results are queued
        # via self._downloader.download() rather than returned.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the fixed 8-character 'gvsearch' prefix, keeping the count part.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        # Page through the search results (10 per page) until n unique ids
        # are collected or the 'next page' marker disappears.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1580
1581
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Announce the fetch of one page of search results."""
        self.to_screen(u'query "%s": Downloading page %s' % (query.decode(preferredencoding()), pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:<terms>' pseudo-URL and queue downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the fixed 'yvsearch' part, keep the count
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Collect up to n unique video ids for query and queue each one."""

        collected = []
        seen = set()
        page_index = 1

        while True:
            self.report_download_page(query, page_index)
            search_request = compat_urllib_request.Request(
                self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_index))
            try:
                page = compat_urllib_request.urlopen(search_request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new identifiers from this page, stopping once n are found.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in seen:
                    continue
                collected.append(candidate)
                seen.add(candidate)
                if len(collected) == n:
                    for vid in collected:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # No further pages: queue whatever was gathered and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in collected:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            page_index = page_index + 1
1663
1664
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE whitespace, so it must be
        # compiled with that flag here too.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all playlist entries via the GData API and return one
        playlist result of Youtube url_results, ordered by position."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUGFIX: for an empty playlist the loop breaks before the first page
        # assigns playlist_title, which used to raise UnboundLocalError at
        # playlist_result() below; initialize it up front.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # NOTE(review): positions come straight from the JSON; if the API
        # returns them as strings this sorts lexicographically — confirm.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1744
1745
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is served as plain HTML.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present whenever more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Subsequent pages come from this JSON AJAX endpoint.
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated, in order."""
        found = []
        seen = set()
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in seen:
                seen.add(video_id)
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect every video of a channel and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1
        video_ids = []

        # Page 1: plain HTML channel listing.
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids.extend(self.extract_videos_from_page(page))

        # Remaining pages: JSON-based channel_ajax queries, until the
        # "load more" widget disappears.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1818
1819
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The gdata API caps each response at this many entries.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        end_index = start_index + self._GDATA_PAGE_SIZE
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, end_index))

    def _real_extract(self, url):
        """Return a playlist result with every upload of the given user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The Data API serves results page by page (currently 50 per
        # request), so keep fetching until a page comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the video ids on this page, deduplicated per page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page shorter than the page size must be the last one,
            # so we can stop querying.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title=username)]
1887
1888
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Results per AJAX query (currently fixed by the site).
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Collect every video of a blip.tv user via the paged AJAX API
        and return a single playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: the user-id lookup used to run inside the try block above
        # with no None check; a missing match raised an uncaught
        # AttributeError instead of a proper error report.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUGFIX: use compat_str for consistency with every other
                # error handler in this file (str(err) is py2/py3-fragile).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUGFIX: deduplicate on the unescaped id — the old code
                # compared the raw match but appended the unescaped value,
                # so entity-escaped duplicates slipped through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
1964
1965
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Request the file page with the 'Free download' form submitted
        and extract the direct download URL, title and extension."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        # BUGFIX: POST data must be bytes on Python 3.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication).encode('ascii'))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the response once up front.  The old code kept
            # the page as bytes, which made the str-pattern regexes below
            # fail on Python 3, and called .decode('utf-8') on values that
            # are already str (file_id, file_url, file_title, extension).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2016
2017
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials.

        Logging in is optional: when no credentials are configured the
        method returns silently and extraction proceeds anonymously.
        Login failures are reported as warnings, never as hard errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # BUGFIX: POST data must be bytes on Python 3.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # BUGFIX: decode the response — matching a str pattern against
            # bytes raises TypeError on Python 3.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL (HD preferred, SD fallback), title,
        duration and thumbnail from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flash player parameters are embedded as a JSON array between
        # these two JavaScript markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON inside the outer JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2115
2116
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pattern used to take the file extension from the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Resolve a blip.tv URL to a single info dict.

        /play/ URLs are first resolved (via their redirect) to a canonical
        http://blip.tv/a/a-<id> URL and re-extracted recursively.
        Otherwise the page's JSON representation (skin=json) is requested;
        if the server answers with the media itself (video/* Content-Type),
        a direct-download entry is returned instead.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose URL fragment carries the
        # file id; rebuild the canonical URL from it and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the site for the JSON description of the video.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): title is already str on Python 3, where this
                # .decode() would raise AttributeError — looks Python-2-only;
                # confirm before relying on this branch under Python 3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle for the download itself.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above and is read here.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp looks like '10-31-12 09:15AM' -> 'YYYYMMDD'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2213
2214
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        """Download the watch page and return a one-element list with the
        video info; the flv URL is derived from the thumbnail URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.report_error — no such attribute,
            # so invalid URLs raised AttributeError instead of reporting.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media server base URL is only exposed via the thumbnail link.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2256
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                      (?P<showname>thedailyshow|colbertnation)\.com/
                      (full-episodes/(?P<episode>.*)|
                        (?P<clip>
                            (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                            |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest to highest; the last entry is the default pick.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension (all mp4 at present).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> display resolution, used only by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report that the media configuration XML is being fetched."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report that the episode's MRSS index is being fetched."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate formats for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip.

        Resolves shortname aliases (e.g. :tds) and show-front-page URLs to a
        concrete episode URL (following redirects), locates the mtvnservices
        media URI in the page, downloads the MRSS index listing the parts,
        then fetches the per-part configuration to choose a bitrate and
        rewrites the rtmp URL to a plain http one.  Returns one info dict
        per part.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the newest-episode
        # page of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest episode"; the
            # concrete episode URL is discovered via the redirect below.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirected us to the newest episode; re-parse the
            # final URL to get its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists every part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid is e.g. '...:<show>.com:<mediaid>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into a plain http one on the llnwd CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2448
2449
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report that the player configuration is being fetched."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the mp4 URL and metadata from an Escapist video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_page = response.read()
            # Decode with the charset advertised in the Content-Type header,
            # defaulting to utf-8 when none is declared.
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            webpage = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', webpage).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', webpage).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', webpage).group(1))
        # The player URL carries a quoted config URL in its query string.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2520
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is currently marked as broken.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the Adobe f4m manifest,
        and assemble the first-fragment URL (extension 'f4f')."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Title/description/thumbnail and the manifest URL come from the
        # moogaloop player metadata XML.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # The hdcore parameter is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m-namespaced elements; note video_id is rebound here to the
            # manifest's own id value.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the first-fragment URL relative to the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2587
2588
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv URL is embedded urlencoded in the page source
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The page <title> carries the video title plus a site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail is a full image URL on the site's image servers
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2642
2643
class SoundcloudIE(InfoExtractor):
    """Extractor for individual soundcloud.com tracks.

    The track page is resolved through the public API to obtain its
    metadata, then the streams endpoint is queried for the MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Uploader and song slug are both encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the streams endpoint for the actual media URL
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2709
class SoundcloudSetIE(InfoExtractor):
    """Extractor for soundcloud.com sets (playlists).

    The set page is resolved through the public API, then the MP3
    stream URL is fetched for every track the set contains.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Uploader and set slug are both encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports failures as a list of error objects
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            try:
                stream_json = compat_urllib_request.urlopen(
                    compat_urllib_request.Request(streams_url)).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            videos.append({
                'id': video_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2783
2784
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The rtmp path is stored base64-encoded in an inline script variable
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # e.g. ".../something.mp4" -> id "something", ext "mp4"
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2834
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format entry maps bitrates to url lists, pick the requested
        bitrate (falling back to the highest one available); a flat entry
        (no bitrate info) is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url.
        # BUG FIX: re.match against a text url already yields text groups;
        # the old .decode('utf-8') calls crashed on Python 3 (str has no decode).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen().read() returns bytes; decode before loading)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format until one of its URLs responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2942
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Depending on which parts of the URL matched, this handles:
      * a specific video page -> returns a single-entry info list,
      * a course page         -> recursively extracts every linked video page,
      * the site root         -> recursively extracts every linked course page.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata (title and file name) lives in an XML file
            # next to the course's videos directory
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the video file name
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

            # Course title from the page <h1>, falling back to the course id
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Queue every linked VideoPage for recursive extraction
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Queue every linked CoursePage for recursive extraction
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3046
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUG FIX: webpage is already text, so the old
        # mobj.group(1).decode('iso-8859-1') calls crashed on Python 3
        # (str has no decode method); the groups are used directly.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: message previously read 'unable to mtvn_uri'
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3122
3123
3124 class YoukuIE(InfoExtractor):
3125 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3126
3127 def _gen_sid(self):
3128 nowTime = int(time.time() * 1000)
3129 random1 = random.randint(1000,1998)
3130 random2 = random.randint(1000,9999)
3131
3132 return "%d%d%d" %(nowTime,random1,random2)
3133
3134 def _get_file_ID_mix_string(self, seed):
3135 mixed = []
3136 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3137 seed = float(seed)
3138 for i in range(len(source)):
3139 seed = (seed * 211 + 30031 ) % 65536
3140 index = math.floor(seed / 65536 * len(source) )
3141 mixed.append(source[int(index)])
3142 source.remove(source[int(index)])
3143 #return ''.join(mixed)
3144 return mixed
3145
3146 def _get_file_id(self, fileId, seed):
3147 mixed = self._get_file_ID_mix_string(seed)
3148 ids = fileId.split('*')
3149 realId = []
3150 for ch in ids:
3151 if ch:
3152 realId.append(mixed[int(ch)])
3153 return ''.join(realId)
3154
3155 def _real_extract(self, url):
3156 mobj = re.match(self._VALID_URL, url)
3157 if mobj is None:
3158 self._downloader.report_error(u'invalid URL: %s' % url)
3159 return
3160 video_id = mobj.group('ID')
3161
3162 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3163
3164 request = compat_urllib_request.Request(info_url, None, std_headers)
3165 try:
3166 self.report_download_webpage(video_id)
3167 jsondata = compat_urllib_request.urlopen(request).read()
3168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3169 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3170 return
3171
3172 self.report_extraction(video_id)
3173 try:
3174 jsonstr = jsondata.decode('utf-8')
3175 config = json.loads(jsonstr)
3176
3177 video_title = config['data'][0]['title']
3178 seed = config['data'][0]['seed']
3179
3180 format = self._downloader.params.get('format', None)
3181 supported_format = list(config['data'][0]['streamfileids'].keys())
3182
3183 if format is None or format == 'best':
3184 if 'hd2' in supported_format:
3185 format = 'hd2'
3186 else:
3187 format = 'flv'
3188 ext = u'flv'
3189 elif format == 'worst':
3190 format = 'mp4'
3191 ext = u'mp4'
3192 else:
3193 format = 'flv'
3194 ext = u'flv'
3195
3196
3197 fileid = config['data'][0]['streamfileids'][format]
3198 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3199 except (UnicodeDecodeError, ValueError, KeyError):
3200 self._downloader.report_error(u'unable to extract info section')
3201 return
3202
3203 files_info=[]
3204 sid = self._gen_sid()
3205 fileid = self._get_file_id(fileid, seed)
3206
3207 #column 8,9 of fileid represent the segment number
3208 #fileid[7:9] should be changed
3209 for index, key in enumerate(keys):
3210
3211 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3212 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3213
3214 info = {
3215 'id': '%s_part%02d' % (video_id, index),
3216 'url': download_url,
3217 'uploader': None,
3218 'upload_date': None,
3219 'title': video_title,
3220 'ext': ext,
3221 }
3222 files_info.append(info)
3223
3224 return files_info
3225
3226
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Fetch the page and decode it as UTF-8
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # Scrape url, title and thumbnail; abort on the first miss
        fields = {}
        for key, pattern, errmsg in (
                ('url', self.VIDEO_URL_RE, u'unable to extract video url'),
                ('title', self.VIDEO_TITLE_RE, u'unable to extract video title'),
                ('thumbnail', self.VIDEO_THUMB_RE, u'unable to extract video thumbnail')):
            match = re.search(pattern, webpage)
            if match is None:
                self._downloader.report_error(errmsg)
                return
            fields[key] = match.group(1)

        return [{
            'id': video_id,
            'url': compat_urllib_parse.unquote(fields['url']),
            'uploader': None,
            'upload_date': None,
            'title': fields['title'],
            'ext': 'flv',
            'thumbnail': fields['thumbnail'],
            'description': None,
        }]
3281
3282
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video behind a Google+ post.

        Downloads the post page, scrapes date/uploader/title, follows the
        photo-viewer page it references and picks the highest-resolution
        video link found there.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # BUG FIX: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # BUG FIX: previously fell through and crashed indexing an empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3403
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The mp4 lives on a CDN path derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the info-dict
            # contract documented at the top of this file uses 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3439
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page; return (item_count, valid_item_infos).

        BUG FIX: the error paths used to return None, which crashed the
        `page_count, page_info = ...` tuple unpacking in _real_extract;
        they now return (0, []) so the pagination loop ends gracefully.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # The API signals errors with an object instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins "YYYY-MM-DD..."; keep just the date digits
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archive listing is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we reached the end
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3522
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bail out: continuing would call .group() on None below.
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> before giving up.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3561
class SteamIE(InfoExtractor):
    """Information extractor for Steam store game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is mandatory here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # These query parameters satisfy the age gate up front.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three finditer streams run in lockstep, one entry per trailer.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Skip this entry; previously an item with an empty URL was
                # still appended after reporting the error.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3605
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The recorded flv lives at a fixed CDN location derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader_match.group('uploader'),
        }]
3627
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Typo fix: the site is "World Star", not "World Start".
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
        return results
3683
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON assignment in a script tag.
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s variant from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3718
3719
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Pretend the age check has already been passed.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: previously this tested the stale 'result' regex match,
            # so a missing format was never reported and [None] was returned.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3836
3837
3838
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player setup code.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        match = re.search(VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group('url'))

        # Extract the upload date.
        # NOTE(review): the error message below says "title" but this step
        # extracts the date; kept verbatim to preserve behavior.
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        match = re.search(VIDEO_UPLOADED_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        upload_date = match.group('date')

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
3880
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the main page and take the title from it.
        webpage = self._download_webpage(url, video_id)
        match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = match.group('title').strip()

        # The actual player lives on a separate embed page.
        match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = match.group(0).strip()
        video_id = match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the flash player via addVariable.
        match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3926
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is enough to drive the play/next API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        res = []
        next_url = first_url
        track_index = 0
        # Walk the mix one track at a time until the API flags the last one.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return res
3970
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow fixed CDN patterns based on the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
3994
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: matches either a playlist URL or a single-talk URL.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the matched URL type: single talk vs. playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each playlist entry carries its talk id, item id and media slug.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk URL back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and the media slug out of the talkDetails blob.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE: unlike most extractors this helper returns a single info
        # dict; _real_extract wraps it in a list.
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
        return info
4072
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this fallback referenced the undefined name 'ext'
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4128
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document describes the available streams.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Take the last variant listed in the document.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4161
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Bug fix: bail out instead of calling .group() on None below.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4210
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId when the URL carries one.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4250
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Typo fix: message previously read "No video founded".
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        m_thumb = re.search(re_thumb, webpage)
        # Robustness fix: a missing poster no longer aborts the extraction
        # with an AttributeError on None.
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4284
4285
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # Most specific YouTube URL forms first, plain video URLs last.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        # GenericIE comes last, after every specific extractor has had
        # a chance to claim the URL (ordering rule documented above).
        GenericIE()
    ]
4342
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<name>IE" module-level naming convention.
    return globals()['%sIE' % ie_name]
|