summaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
blob: 6fa7c334ea9e7ebafbb7805768e55e4f43ca00df (plain)
    1 from __future__ import unicode_literals
    2 
    3 import base64
    4 import datetime
    5 import hashlib
    6 import json
    7 import netrc
    8 import os
    9 import re
   10 import socket
   11 import sys
   12 import time
   13 import math
   14 
   15 from ..compat import (
   16     compat_cookiejar,
   17     compat_cookies,
   18     compat_etree_fromstring,
   19     compat_getpass,
   20     compat_http_client,
   21     compat_os_name,
   22     compat_str,
   23     compat_urllib_error,
   24     compat_urllib_parse_unquote,
   25     compat_urllib_parse_urlencode,
   26     compat_urllib_request,
   27     compat_urlparse,
   28 )
   29 from ..downloader.f4m import remove_encrypted_media
   30 from ..utils import (
   31     NO_DEFAULT,
   32     age_restricted,
   33     base_url,
   34     bug_reports_message,
   35     clean_html,
   36     compiled_regex_type,
   37     determine_ext,
   38     error_to_compat_str,
   39     ExtractorError,
   40     fix_xml_ampersands,
   41     float_or_none,
   42     int_or_none,
   43     parse_iso8601,
   44     RegexNotFoundError,
   45     sanitize_filename,
   46     sanitized_Request,
   47     unescapeHTML,
   48     unified_strdate,
   49     unified_timestamp,
   50     url_basename,
   51     xpath_element,
   52     xpath_text,
   53     xpath_with_ns,
   54     determine_protocol,
   55     parse_duration,
   56     mimetype2ext,
   57     update_Request,
   58     update_url_query,
   59     parse_m3u8_attributes,
   60     extract_attributes,
   61     parse_codecs,
   62     urljoin,
   63 )
   64 
   65 
   66 class InfoExtractor(object):
   67     """Information Extractor class.
   68 
   69     Information extractors are the classes that, given a URL, extract
   70     information about the video (or videos) the URL refers to. This
   71     information includes the real video URL, the video title, author and
   72     others. The information is stored in a dictionary which is then
   73     passed to the YoutubeDL. The YoutubeDL processes this
   74     information possibly downloading the video to the file system, among
   75     other possible outcomes.
   76 
   77     The type field determines the type of the result.
   78     By far the most common value (and the default if _type is missing) is
   79     "video", which indicates a single video.
   80 
   81     For a video, the dictionaries must include the following fields:
   82 
   83     id:             Video identifier.
   84     title:          Video title, unescaped.
   85 
   86     Additionally, it must contain either a formats entry or a url one:
   87 
   88     formats:        A list of dictionaries for each format available, ordered
   89                     from worst to best quality.
   90 
   91                     Potential fields:
   92                     * url        Mandatory. The URL of the video file
   93                     * manifest_url
   94                                  The URL of the manifest file in case of
   95                                  fragmented media (DASH, hls, hds)
   96                     * ext        Will be calculated from URL if missing
   97                     * format     A human-readable description of the format
   98                                  ("mp4 container with h264/opus").
   99                                  Calculated from the format_id, width, height.
  100                                  and format_note fields if missing.
  101                     * format_id  A short description of the format
  102                                  ("mp4_h264_opus" or "19").
  103                                 Technically optional, but strongly recommended.
  104                     * format_note Additional info about the format
  105                                  ("3D" or "DASH video")
  106                     * width      Width of the video, if known
  107                     * height     Height of the video, if known
  108                     * resolution Textual description of width and height
  109                     * tbr        Average bitrate of audio and video in KBit/s
  110                     * abr        Average audio bitrate in KBit/s
  111                     * acodec     Name of the audio codec in use
  112                     * asr        Audio sampling rate in Hertz
  113                     * vbr        Average video bitrate in KBit/s
  114                     * fps        Frame rate
  115                     * vcodec     Name of the video codec in use
  116                     * container  Name of the container format
  117                     * filesize   The number of bytes, if known in advance
  118                     * filesize_approx  An estimate for the number of bytes
  119                     * player_url SWF Player URL (used for rtmpdump).
  120                     * protocol   The protocol that will be used for the actual
  121                                  download, lower-case.
  122                                  "http", "https", "rtsp", "rtmp", "rtmpe",
  123                                  "m3u8", "m3u8_native" or "http_dash_segments".
  124                     * fragments  A list of fragments of the fragmented media,
  125                                  with the following entries:
  126                                  * "url" (mandatory) - fragment's URL
  127                                  * "duration" (optional, int or float)
  128                                  * "filesize" (optional, int)
  129                     * preference Order number of this format. If this field is
  130                                  present and not None, the formats get sorted
  131                                  by this field, regardless of all other values.
  132                                  -1 for default (order by other properties),
  133                                  -2 or smaller for less than default.
  134                                  < -1000 to hide the format (if there is
  135                                     another one which is strictly better)
  136                     * language   Language code, e.g. "de" or "en-US".
  137                     * language_preference  Is this in the language mentioned in
  138                                  the URL?
  139                                  10 if it's what the URL is about,
  140                                  -1 for default (don't know),
  141                                  -10 otherwise, other values reserved for now.
  142                     * quality    Order number of the video quality of this
  143                                  format, irrespective of the file format.
  144                                  -1 for default (order by other properties),
  145                                  -2 or smaller for less than default.
  146                     * source_preference  Order number for this video source
  147                                   (quality takes higher priority)
  148                                  -1 for default (order by other properties),
  149                                  -2 or smaller for less than default.
  150                     * http_headers  A dictionary of additional HTTP headers
  151                                  to add to the request.
  152                     * stretched_ratio  If given and not 1, indicates that the
  153                                  video's pixels are not square.
  154                                  width : height ratio as float.
  155                     * no_resume  The server does not support resuming the
  156                                  (HTTP or RTMP) download. Boolean.
  157 
  158     url:            Final video URL.
  159     ext:            Video filename extension.
  160     format:         The video format, defaults to ext (used for --get-format)
  161     player_url:     SWF Player URL (used for rtmpdump).
  162 
  163     The following fields are optional:
  164 
  165     alt_title:      A secondary title of the video.
  166     display_id      An alternative identifier for the video, not necessarily
  167                     unique, but available before title. Typically, id is
  168                     something like "4234987", title "Dancing naked mole rats",
  169                     and display_id "dancing-naked-mole-rats"
  170     thumbnails:     A list of dictionaries, with the following entries:
  171                         * "id" (optional, string) - Thumbnail format ID
  172                         * "url"
  173                         * "preference" (optional, int) - quality of the image
  174                         * "width" (optional, int)
  175                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
  177                                         deprecated)
  178                         * "filesize" (optional, int)
  179     thumbnail:      Full URL to a video thumbnail image.
  180     description:    Full video description.
  181     uploader:       Full name of the video uploader.
  182     license:        License name the video is licensed under.
  183     creator:        The creator of the video.
  184     release_date:   The date (YYYYMMDD) when the video was released.
  185     timestamp:      UNIX timestamp of the moment the video became available.
  186     upload_date:    Video upload date (YYYYMMDD).
  187                     If not explicitly set, calculated from timestamp.
  188     uploader_id:    Nickname or id of the video uploader.
  189     uploader_url:   Full URL to a personal webpage of the video uploader.
  190     location:       Physical location where the video was filmed.
  191     subtitles:      The available subtitles as a dictionary in the format
  192                     {tag: subformats}. "tag" is usually a language code, and
  193                     "subformats" is a list sorted from lower to higher
  194                     preference, each element is a dictionary with the "ext"
  195                     entry and one of:
  196                         * "data": The subtitles file contents
  197                         * "url": A URL pointing to the subtitles file
  198                     "ext" will be calculated from URL if missing
  199     automatic_captions: Like 'subtitles', used by the YoutubeIE for
  200                     automatically generated captions
  201     duration:       Length of the video in seconds, as an integer or float.
  202     view_count:     How many users have watched the video on the platform.
  203     like_count:     Number of positive ratings of the video
  204     dislike_count:  Number of negative ratings of the video
  205     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
  207     comment_count:  Number of comments on the video
  208     comments:       A list of comments, each with one or more of the following
  209                     properties (all but one of text or html optional):
  210                         * "author" - human-readable name of the comment author
  211                         * "author_id" - user ID of the comment author
  212                         * "id" - Comment ID
  213                         * "html" - Comment as HTML
  214                         * "text" - Plain text of the comment
  215                         * "timestamp" - UNIX timestamp of comment
  216                         * "parent" - ID of the comment this one is replying to.
  217                                      Set to "root" to indicate that this is a
  218                                      comment to the original video.
  219     age_limit:      Age restriction for the video, as an integer (years)
  220     webpage_url:    The URL to the video webpage, if given to youtube-dl it
  221                     should allow to get the same result again. (It will be set
  222                     by YoutubeDL if it's missing)
  223     categories:     A list of categories that the video falls in, for example
  224                     ["Sports", "Berlin"]
  225     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
  226     is_live:        True, False, or None (=unknown). Whether this video is a
  227                     live stream that goes on instead of a fixed-length video.
  228     start_time:     Time in seconds where the reproduction should start, as
  229                     specified in the URL.
  230     end_time:       Time in seconds where the reproduction should end, as
  231                     specified in the URL.
  232 
  233     The following fields should only be used when the video belongs to some logical
  234     chapter or section:
  235 
  236     chapter:        Name or title of the chapter the video belongs to.
  237     chapter_number: Number of the chapter the video belongs to, as an integer.
  238     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
  239 
  240     The following fields should only be used when the video is an episode of some
  241     series, programme or podcast:
  242 
  243     series:         Title of the series or programme the video episode belongs to.
  244     season:         Title of the season the video episode belongs to.
  245     season_number:  Number of the season the video episode belongs to, as an integer.
  246     season_id:      Id of the season the video episode belongs to, as a unicode string.
  247     episode:        Title of the video episode. Unlike mandatory video title field,
  248                     this field should denote the exact title of the video episode
  249                     without any kind of decoration.
  250     episode_number: Number of the video episode within a season, as an integer.
  251     episode_id:     Id of the video episode, as a unicode string.
  252 
  253     The following fields should only be used when the media is a track or a part of
  254     a music album:
  255 
  256     track:          Title of the track.
  257     track_number:   Number of the track within an album or a disc, as an integer.
  258     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
  259                     as a unicode string.
  260     artist:         Artist(s) of the track.
  261     genre:          Genre(s) of the track.
  262     album:          Title of the album the track belongs to.
  263     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  264     album_artist:   List of all artists appeared on the album (e.g.
  265                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
  266                     and compilations).
  267     disc_number:    Number of the disc or other physical medium the track belongs to,
  268                     as an integer.
  269     release_year:   Year (YYYY) when the album was released.
  270 
  271     Unless mentioned otherwise, the fields should be Unicode strings.
  272 
  273     Unless mentioned otherwise, None is equivalent to absence of information.
  274 
  275 
  276     _type "playlist" indicates multiple videos.
  277     There must be a key "entries", which is a list, an iterable, or a PagedList
  278     object, each element of which is a valid dictionary by this specification.
  279 
  280     Additionally, playlists can have "title", "description" and "id" attributes
  281     with the same semantics as videos (see above).
  282 
  283 
  284     _type "multi_video" indicates that there are multiple videos that
  285     form a single show, for examples multiple acts of an opera or TV episode.
  286     It must have an entries key like a playlist and contain all the keys
  287     required for a video at the same time.
  288 
  289 
  290     _type "url" indicates that the video must be extracted from another
  291     location, possibly by a different extractor. Its only required key is:
  292     "url" - the next URL to extract.
  293     The key "ie_key" can be set to the class name (minus the trailing "IE",
  294     e.g. "Youtube") if the extractor class is known in advance.
  295     Additionally, the dictionary may have any properties of the resolved entity
  296     known in advance, for example "title" if the title of the referred video is
  297     known ahead of time.
  298 
  299 
  300     _type "url_transparent" entities have the same specification as "url", but
  301     indicate that the given additional information is more precise than the one
  302     associated with the resolved URL.
  303     This is useful when a site employs a video service that hosts the video and
  304     its technical metadata, but that video service does not embed a useful
  305     title, description etc.
  306 
  307 
  308     Subclasses of this one should re-define the _real_initialize() and
  309     _real_extract() methods and define a _VALID_URL regexp.
  310     Probably, they should also be added to the list of extractors.
  311 
  312     Finally, the _WORKING attribute should be set to False for broken IEs
  313     in order to warn the users and skip the tests.
  314     """
  315 
  316     _ready = False
  317     _downloader = None
  318     _WORKING = True
  319 
  320     def __init__(self, downloader=None):
  321         """Constructor. Receives an optional downloader."""
  322         self._ready = False
  323         self.set_downloader(downloader)
  324 
  325     @classmethod
  326     def suitable(cls, url):
  327         """Receives a URL and returns True if suitable for this IE."""
  328 
  329         # This does not use has/getattr intentionally - we want to know whether
  330         # we have cached the regexp for *this* class, whereas getattr would also
  331         # match the superclass
  332         if '_VALID_URL_RE' not in cls.__dict__:
  333             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
  334         return cls._VALID_URL_RE.match(url) is not None
  335 
  336     @classmethod
  337     def _match_id(cls, url):
  338         if '_VALID_URL_RE' not in cls.__dict__:
  339             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
  340         m = cls._VALID_URL_RE.match(url)
  341         assert m
  342         return m.group('id')
  343 
  344     @classmethod
  345     def working(cls):
  346         """Getter method for _WORKING."""
  347         return cls._WORKING
  348 
  349     def initialize(self):
  350         """Initializes an instance (authentication, etc)."""
  351         if not self._ready:
  352             self._real_initialize()
  353             self._ready = True
  354 
  355     def extract(self, url):
  356         """Extracts URL information and returns it in list of dicts."""
  357         try:
  358             self.initialize()
  359             return self._real_extract(url)
  360         except ExtractorError:
  361             raise
  362         except compat_http_client.IncompleteRead as e:
  363             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
  364         except (KeyError, StopIteration) as e:
  365             raise ExtractorError('An extractor error has occurred.', cause=e)
  366 
  367     def set_downloader(self, downloader):
  368         """Sets the downloader for this IE."""
  369         self._downloader = downloader
  370 
  371     def _real_initialize(self):
  372         """Real initialization process. Redefine in subclasses."""
  373         pass
  374 
  375     def _real_extract(self, url):
  376         """Real extraction process. Redefine in subclasses."""
  377         pass
  378 
  379     @classmethod
  380     def ie_key(cls):
  381         """A string for getting the InfoExtractor with get_info_extractor"""
  382         return compat_str(cls.__name__[:-2])
  383 
  384     @property
  385     def IE_NAME(self):
  386         return compat_str(type(self).__name__[:-2])
  387 
  388     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
  389         """ Returns the response handle """
  390         if note is None:
  391             self.report_download_webpage(video_id)
  392         elif note is not False:
  393             if video_id is None:
  394                 self.to_screen('%s' % (note,))
  395             else:
  396                 self.to_screen('%s: %s' % (video_id, note))
  397         if isinstance(url_or_request, compat_urllib_request.Request):
  398             url_or_request = update_Request(
  399                 url_or_request, data=data, headers=headers, query=query)
  400         else:
  401             if query:
  402                 url_or_request = update_url_query(url_or_request, query)
  403             if data is not None or headers:
  404                 url_or_request = sanitized_Request(url_or_request, data, headers)
  405         try:
  406             return self._downloader.urlopen(url_or_request)
  407         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  408             if errnote is False:
  409                 return False
  410             if errnote is None:
  411                 errnote = 'Unable to download webpage'
  412 
  413             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
  414             if fatal:
  415                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
  416             else:
  417                 self._downloader.report_warning(errmsg)
  418                 return False
  419 
  420     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
  421         """ Returns a tuple (page content as string, URL handle) """
  422         # Strip hashes from the URL (#1038)
  423         if isinstance(url_or_request, (compat_str, str)):
  424             url_or_request = url_or_request.partition('#')[0]
  425 
  426         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
  427         if urlh is False:
  428             assert not fatal
  429             return False
  430         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
  431         return (content, urlh)
  432 
  433     @staticmethod
  434     def _guess_encoding_from_content(content_type, webpage_bytes):
  435         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
  436         if m:
  437             encoding = m.group(1)
  438         else:
  439             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
  440                           webpage_bytes[:1024])
  441             if m:
  442                 encoding = m.group(1).decode('ascii')
  443             elif webpage_bytes.startswith(b'\xff\xfe'):
  444                 encoding = 'utf-16'
  445             else:
  446                 encoding = 'utf-8'
  447 
  448         return encoding
  449 
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of urlh and decode it to text.

        Honors the dump_intermediate_pages and write_pages downloader
        options, guesses the encoding when none is supplied, and raises
        ExtractorError for pages recognized as filtering/censorship
        block notices.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            # Caller-supplied bytes to prepend before decoding.
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain string, not a Request object.
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            # Base64 keeps binary/raw bytes safe for terminal output.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the filename short enough for common filesystem
                # limits; an md5 of the full name preserves uniqueness.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name from header/meta - fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
  512 
  513     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
  514         """ Returns the data of the page as a string """
  515         success = False
  516         try_count = 0
  517         while success is False:
  518             try:
  519                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
  520                 success = True
  521             except compat_http_client.IncompleteRead as e:
  522                 try_count += 1
  523                 if try_count >= tries:
  524                     raise e
  525                 self._sleep(timeout, video_id)
  526         if res is False:
  527             return res
  528         else:
  529             content, _ = res
  530             return content
  531 
  532     def _download_xml(self, url_or_request, video_id,
  533                       note='Downloading XML', errnote='Unable to download XML',
  534                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
  535         """Return the xml as an xml.etree.ElementTree.Element"""
  536         xml_string = self._download_webpage(
  537             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
  538         if xml_string is False:
  539             return xml_string
  540         if transform_source:
  541             xml_string = transform_source(xml_string)
  542         return compat_etree_fromstring(xml_string.encode('utf-8'))
  543 
  544     def _download_json(self, url_or_request, video_id,
  545                        note='Downloading JSON metadata',
  546                        errnote='Unable to download JSON metadata',
  547                        transform_source=None,
  548                        fatal=True, encoding=None, data=None, headers={}, query={}):
  549         json_string = self._download_webpage(
  550             url_or_request, video_id, note, errnote, fatal=fatal,
  551             encoding=encoding, data=data, headers=headers, query=query)
  552         if (not fatal) and json_string is False:
  553             return None
  554         return self._parse_json(
  555             json_string, video_id, transform_source=transform_source, fatal=fatal)
  556 
  557     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
  558         if transform_source:
  559             json_string = transform_source(json_string)
  560         try:
  561             return json.loads(json_string)
  562         except ValueError as ve:
  563             errmsg = '%s: Failed to parse JSON ' % video_id
  564             if fatal:
  565                 raise ExtractorError(errmsg, cause=ve)
  566             else:
  567                 self.report_warning(errmsg + str(ve))
  568 
  569     def report_warning(self, msg, video_id=None):
  570         idstr = '' if video_id is None else '%s: ' % video_id
  571         self._downloader.report_warning(
  572             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
  573 
  574     def to_screen(self, msg):
  575         """Print msg to screen, prefixing it with '[ie_name]'"""
  576         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
  577 
  578     def report_extraction(self, id_or_name):
  579         """Report information extraction."""
  580         self.to_screen('%s: Extracting information' % id_or_name)
  581 
  582     def report_download_webpage(self, video_id):
  583         """Report webpage download."""
  584         self.to_screen('%s: Downloading webpage' % video_id)
  585 
  586     def report_age_confirmation(self):
  587         """Report attempt to confirm age."""
  588         self.to_screen('Confirming age')
  589 
  590     def report_login(self):
  591         """Report attempt to log in."""
  592         self.to_screen('Logging in')
  593 
  594     @staticmethod
  595     def raise_login_required(msg='This video is only available for registered users'):
  596         raise ExtractorError(
  597             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
  598             expected=True)
  599 
  600     @staticmethod
  601     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
  602         raise ExtractorError(
  603             '%s. You might want to use --proxy to workaround.' % msg,
  604             expected=True)
  605 
  606     # Methods for following #608
  607     @staticmethod
  608     def url_result(url, ie=None, video_id=None, video_title=None):
  609         """Returns a URL that points to a page that should be processed"""
  610         # TODO: ie should be the class used for getting the info
  611         video_info = {'_type': 'url',
  612                       'url': url,
  613                       'ie_key': ie}
  614         if video_id is not None:
  615             video_info['id'] = video_id
  616         if video_title is not None:
  617             video_info['title'] = video_title
  618         return video_info
  619 
  620     @staticmethod
  621     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
  622         """Returns a playlist"""
  623         video_info = {'_type': 'playlist',
  624                       'entries': entries}
  625         if playlist_id:
  626             video_info['id'] = playlist_id
  627         if playlist_title:
  628             video_info['title'] = playlist_title
  629         if playlist_description:
  630             video_info['description'] = playlist_description
  631         return video_info
  632 
  633     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
  634         """
  635         Perform a regex search on the given string, using a single or a list of
  636         patterns returning the first matching group.
  637         In case of failure return a default value or raise a WARNING or a
  638         RegexNotFoundError, depending on fatal, specifying the field name.
  639         """
  640         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
  641             mobj = re.search(pattern, string, flags)
  642         else:
  643             for p in pattern:
  644                 mobj = re.search(p, string, flags)
  645                 if mobj:
  646                     break
  647 
  648         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
  649             _name = '\033[0;34m%s\033[0m' % name
  650         else:
  651             _name = name
  652 
  653         if mobj:
  654             if group is None:
  655                 # return the first matching group
  656                 return next(g for g in mobj.groups() if g is not None)
  657             else:
  658                 return mobj.group(group)
  659         elif default is not NO_DEFAULT:
  660             return default
  661         elif fatal:
  662             raise RegexNotFoundError('Unable to extract %s' % _name)
  663         else:
  664             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
  665             return None
  666 
  667     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
  668         """
  669         Like _search_regex, but strips HTML tags and unescapes entities.
  670         """
  671         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
  672         if res:
  673             return clean_html(res).strip()
  674         else:
  675             return res
  676 
  677     def _get_netrc_login_info(self, netrc_machine=None):
  678         username = None
  679         password = None
  680         netrc_machine = netrc_machine or self._NETRC_MACHINE
  681 
  682         if self._downloader.params.get('usenetrc', False):
  683             try:
  684                 info = netrc.netrc().authenticators(netrc_machine)
  685                 if info is not None:
  686                     username = info[0]
  687                     password = info[2]
  688                 else:
  689                     raise netrc.NetrcParseError(
  690                         'No authenticators for %s' % netrc_machine)
  691             except (IOError, netrc.NetrcParseError) as err:
  692                 self._downloader.report_warning(
  693                     'parsing .netrc: %s' % error_to_compat_str(err))
  694 
  695         return username, password
  696 
  697     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
  698         """
  699         Get the login info as (username, password)
  700         First look for the manually specified credentials using username_option
  701         and password_option as keys in params dictionary. If no such credentials
  702         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
  703         value.
  704         If there's no info available, return (None, None)
  705         """
  706         if self._downloader is None:
  707             return (None, None)
  708 
  709         downloader_params = self._downloader.params
  710 
  711         # Attempt to use provided username and password or .netrc data
  712         if downloader_params.get(username_option) is not None:
  713             username = downloader_params[username_option]
  714             password = downloader_params[password_option]
  715         else:
  716             username, password = self._get_netrc_login_info(netrc_machine)
  717 
  718         return username, password
  719 
  720     def _get_tfa_info(self, note='two-factor verification code'):
  721         """
  722         Get the two-factor authentication info
  723         TODO - asking the user will be required for sms/phone verify
  724         currently just uses the command line option
  725         If there's no info available, return None
  726         """
  727         if self._downloader is None:
  728             return None
  729         downloader_params = self._downloader.params
  730 
  731         if downloader_params.get('twofactor') is not None:
  732             return downloader_params['twofactor']
  733 
  734         return compat_getpass('Type %s and press [Return]: ' % note)
  735 
  736     # Helper functions for extracting OpenGraph info
  737     @staticmethod
  738     def _og_regexes(prop):
  739         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
  740         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
  741                        % {'prop': re.escape(prop)})
  742         template = r'<meta[^>]+?%s[^>]+?%s'
  743         return [
  744             template % (property_re, content_re),
  745             template % (content_re, property_re),
  746         ]
  747 
  748     @staticmethod
  749     def _meta_regex(prop):
  750         return r'''(?isx)<meta
  751                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
  752                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
  753 
  754     def _og_search_property(self, prop, html, name=None, **kargs):
  755         if not isinstance(prop, (list, tuple)):
  756             prop = [prop]
  757         if name is None:
  758             name = 'OpenGraph %s' % prop[0]
  759         og_regexes = []
  760         for p in prop:
  761             og_regexes.extend(self._og_regexes(p))
  762         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
  763         if escaped is None:
  764             return None
  765         return unescapeHTML(escaped)
  766 
  767     def _og_search_thumbnail(self, html, **kargs):
  768         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
  769 
  770     def _og_search_description(self, html, **kargs):
  771         return self._og_search_property('description', html, fatal=False, **kargs)
  772 
  773     def _og_search_title(self, html, **kargs):
  774         return self._og_search_property('title', html, **kargs)
  775 
  776     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
  777         regexes = self._og_regexes('video') + self._og_regexes('video:url')
  778         if secure:
  779             regexes = self._og_regexes('video:secure_url') + regexes
  780         return self._html_search_regex(regexes, html, name, **kargs)
  781 
  782     def _og_search_url(self, html, **kargs):
  783         return self._og_search_property('url', html, **kargs)
  784 
  785     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
  786         if not isinstance(name, (list, tuple)):
  787             name = [name]
  788         if display_name is None:
  789             display_name = name[0]
  790         return self._html_search_regex(
  791             [self._meta_regex(n) for n in name],
  792             html, display_name, fatal=fatal, group='content', **kwargs)
  793 
  794     def _dc_search_uploader(self, html):
  795         return self._html_search_meta('dc.creator', html, 'uploader')
  796 
  797     def _rta_search(self, html):
  798         # See http://www.rtalabel.org/index.php?content=howtofaq#single
  799         if re.search(r'(?ix)<meta\s+name="rating"\s+'
  800                      r'     content="RTA-5042-1996-1400-1577-RTA"',
  801                      html):
  802             return 18
  803         return 0
  804 
  805     def _media_rating_search(self, html):
  806         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
  807         rating = self._html_search_meta('rating', html)
  808 
  809         if not rating:
  810             return None
  811 
  812         RATING_TABLE = {
  813             'safe for kids': 0,
  814             'general': 8,
  815             '14 years': 14,
  816             'mature': 17,
  817             'restricted': 19,
  818         }
  819         return RATING_TABLE.get(rating.lower())
  820 
  821     def _family_friendly_search(self, html):
  822         # See http://schema.org/VideoObject
  823         family_friendly = self._html_search_meta('isFamilyFriendly', html)
  824 
  825         if not family_friendly:
  826             return None
  827 
  828         RATING_TABLE = {
  829             '1': 0,
  830             'true': 0,
  831             '0': 18,
  832             'false': 18,
  833         }
  834         return RATING_TABLE.get(family_friendly.lower())
  835 
  836     def _twitter_search_player(self, html):
  837         return self._html_search_meta('twitter:player', html,
  838                                       'twitter card player')
  839 
  840     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
  841         json_ld = self._search_regex(
  842             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
  843             html, 'JSON-LD', group='json_ld', **kwargs)
  844         default = kwargs.get('default', NO_DEFAULT)
  845         if not json_ld:
  846             return default if default is not NO_DEFAULT else {}
  847         # JSON-LD may be malformed and thus `fatal` should be respected.
  848         # At the same time `default` may be passed that assumes `fatal=False`
  849         # for _search_regex. Let's simulate the same behavior here as well.
  850         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
  851         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
  852 
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert JSON-LD metadata into an info dict.

        json_ld may be a JSON string (parsed here via _parse_json) or an
        already-decoded list/tuple/dict of schema.org items. Only items whose
        @context is 'http://schema.org' are considered; when expected_type is
        given, extraction stops unless the item's @type matches. Recognized
        types: TVEpisode, Article, VideoObject. Returns a dict containing only
        the non-None extracted fields (possibly empty).
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        # Anything that is not a container of items yields nothing
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a single item to a one-element list
            json_ld = [json_ld]
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    # Some sites use the non-standard partOfTVSeries key
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        # thumbnailURL is a non-standard variant seen in the wild
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                # Only the first matching schema.org item is used
                break
        # Drop fields that came out as None
        return dict((k, v) for k, v in info.items() if v is not None)
  901 
  902     @staticmethod
  903     def _hidden_inputs(html):
  904         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
  905         hidden_inputs = {}
  906         for input in re.findall(r'(?i)(<input[^>]+>)', html):
  907             attrs = extract_attributes(input)
  908             if not input:
  909                 continue
  910             if attrs.get('type') not in ('hidden', 'submit'):
  911                 continue
  912             name = attrs.get('name') or attrs.get('id')
  913             value = attrs.get('value')
  914             if name and value is not None:
  915                 hidden_inputs[name] = value
  916         return hidden_inputs
  917 
  918     def _form_hidden_inputs(self, form_id, html):
  919         form = self._search_regex(
  920             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
  921             html, '%s form' % form_id, group='form')
  922         return self._hidden_inputs(form)
  923 
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality.

        When *field_preference* (a list/tuple of format-dict keys) is given,
        formats are ordered solely by those fields; otherwise a composite key
        (preference, language, quality, bitrate, resolution, protocol,
        extension, ...) is used.

        Raises ExtractorError when *formats* is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # Sort key for a single format dict; later tuple fields break
            # ties of earlier ones, missing values sort last (-1 / '').
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering overrides everything else
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # HTTP(S) > other protocols > RTSP
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
  999 
 1000     def _check_formats(self, formats, video_id):
 1001         if formats:
 1002             formats[:] = filter(
 1003                 lambda f: self._is_valid_url(
 1004                     f['url'], video_id,
 1005                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 1006                 formats)
 1007 
 1008     @staticmethod
 1009     def _remove_duplicate_formats(formats):
 1010         format_urls = set()
 1011         unique_formats = []
 1012         for f in formats:
 1013             if f['url'] not in format_urls:
 1014                 format_urls.add(f['url'])
 1015                 unique_formats.append(f)
 1016         formats[:] = unique_formats
 1017 
 1018     def _is_valid_url(self, url, video_id, item='video'):
 1019         url = self._proto_relative_url(url, scheme='http:')
 1020         # For now assume non HTTP(S) URLs always valid
 1021         if not (url.startswith('http://') or url.startswith('https://')):
 1022             return True
 1023         try:
 1024             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 1025             return True
 1026         except ExtractorError as e:
 1027             if isinstance(e.cause, compat_urllib_error.URLError):
 1028                 self.to_screen(
 1029                     '%s: %s URL is invalid, skipping' % (video_id, item))
 1030                 return False
 1031             raise
 1032 
 1033     def http_scheme(self):
 1034         """ Either "http:" or "https:", depending on the user's preferences """
 1035         return (
 1036             'http:'
 1037             if self._downloader.params.get('prefer_insecure', False)
 1038             else 'https:')
 1039 
 1040     def _proto_relative_url(self, url, scheme=None):
 1041         if url is None:
 1042             return url
 1043         if url.startswith('//'):
 1044             if scheme is None:
 1045                 scheme = self.http_scheme()
 1046             return scheme + url
 1047         else:
 1048             return url
 1049 
 1050     def _sleep(self, timeout, video_id, msg_template=None):
 1051         if msg_template is None:
 1052             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 1053         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 1054         self.to_screen(msg)
 1055         time.sleep(timeout)
 1056 
 1057     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 1058                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 1059                              fatal=True, m3u8_id=None):
 1060         manifest = self._download_xml(
 1061             manifest_url, video_id, 'Downloading f4m manifest',
 1062             'Unable to download f4m manifest',
 1063             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 1064             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 1065             transform_source=transform_source,
 1066             fatal=fatal)
 1067 
 1068         if manifest is False:
 1069             return []
 1070 
 1071         return self._parse_f4m_formats(
 1072             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
 1073             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
 1074 
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an already-downloaded f4m manifest (ElementTree element)
        into a list of format dicts.

        Returns [] for Akamai player-verification (DRM) manifests and for
        manifests without usable media nodes. Nested f4m/m3u8 manifests
        referenced from set-level manifests are extracted recursively.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        # NOTE(review): the 'base URL' display name below looks copy-pasted
        # from the baseURL lookup above; it only affects warning text.
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Fall back to the node index when no bitrate is advertised
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # flv is only known for sure when a bootstrapInfo is present
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
 1174 
 1175     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
 1176         return {
 1177             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
 1178             'url': m3u8_url,
 1179             'ext': ext,
 1180             'protocol': 'm3u8',
 1181             'preference': preference - 100 if preference else -100,
 1182             'resolution': 'multiple',
 1183             'format_note': 'Quality selection URL',
 1184         }
 1185 
 1186     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 1187                               entry_protocol='m3u8', preference=None,
 1188                               m3u8_id=None, note=None, errnote=None,
 1189                               fatal=True, live=False):
 1190 
 1191         res = self._download_webpage_handle(
 1192             m3u8_url, video_id,
 1193             note=note or 'Downloading m3u8 information',
 1194             errnote=errnote or 'Failed to download m3u8 information',
 1195             fatal=fatal)
 1196         if res is False:
 1197             return []
 1198         m3u8_doc, urlh = res
 1199         m3u8_url = urlh.geturl()
 1200 
 1201         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
 1202 
 1203         format_url = lambda u: (
 1204             u
 1205             if re.match(r'^https?://', u)
 1206             else compat_urlparse.urljoin(m3u8_url, u))
 1207 
 1208         # We should try extracting formats only from master playlists [1], i.e.
 1209         # playlists that describe available qualities. On the other hand media
 1210         # playlists [2] should be returned as is since they contain just the media
 1211         # without qualities renditions.
 1212         # Fortunately, master playlist can be easily distinguished from media
 1213         # playlist based on particular tags availability. As of [1, 2] master
 1214         # playlist tags MUST NOT appear in a media playist and vice versa.
 1215         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
 1216         # and MUST NOT appear in master playlist thus we can clearly detect media
 1217         # playlist with this criterion.
 1218         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
 1219         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
 1220         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
 1221         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
 1222             return [{
 1223                 'url': m3u8_url,
 1224                 'format_id': m3u8_id,
 1225                 'ext': ext,
 1226                 'protocol': entry_protocol,
 1227                 'preference': preference,
 1228             }]
 1229         audio_in_video_stream = {}
 1230         last_info = {}
 1231         last_media = {}
 1232         for line in m3u8_doc.splitlines():
 1233             if line.startswith('#EXT-X-STREAM-INF:'):
 1234                 last_info = parse_m3u8_attributes(line)
 1235             elif line.startswith('#EXT-X-MEDIA:'):
 1236                 media = parse_m3u8_attributes(line)
 1237                 media_type = media.get('TYPE')
 1238                 if media_type in ('VIDEO', 'AUDIO'):
 1239                     group_id = media.get('GROUP-ID')
 1240                     media_url = media.get('URI')
 1241                     if media_url:
 1242                         format_id = []
 1243                         for v in (group_id, media.get('NAME')):
 1244                             if v:
 1245                                 format_id.append(v)
 1246                         f = {
 1247                             'format_id': '-'.join(format_id),
 1248                             'url': format_url(media_url),
 1249                             'language': media.get('LANGUAGE'),
 1250                             'ext': ext,
 1251                             'protocol': entry_protocol,
 1252                             'preference': preference,
 1253                         }
 1254                         if media_type == 'AUDIO':
 1255                             f['vcodec'] = 'none'
 1256                             if group_id and not audio_in_video_stream.get(group_id):
 1257                                 audio_in_video_stream[group_id] = False
 1258                         formats.append(f)
 1259                     else:
 1260                         # When there is no URI in EXT-X-MEDIA let this tag's
 1261                         # data be used by regular URI lines below
 1262                         last_media = media
 1263                         if media_type == 'AUDIO' and group_id:
 1264                             audio_in_video_stream[group_id] = True
 1265             elif line.startswith('#') or not line.strip():
 1266                 continue
 1267             else:
 1268                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
 1269                 format_id = []
 1270                 if m3u8_id:
 1271                     format_id.append(m3u8_id)
 1272                 # Despite specification does not mention NAME attribute for
 1273                 # EXT-X-STREAM-INF it still sometimes may be present
 1274                 stream_name = last_info.get('NAME') or last_media.get('NAME')
 1275                 # Bandwidth of live streams may differ over time thus making
 1276                 # format_id unpredictable. So it's better to keep provided
 1277                 # format_id intact.
 1278                 if not live:
 1279                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
 1280                 manifest_url = format_url(line.strip())
 1281                 f = {
 1282                     'format_id': '-'.join(format_id),
 1283                     'url': manifest_url,
 1284                     'manifest_url': manifest_url,
 1285                     'tbr': tbr,
 1286                     'ext': ext,
 1287                     'fps': float_or_none(last_info.get('FRAME-RATE')),
 1288                     'protocol': entry_protocol,
 1289                     'preference': preference,
 1290                 }
 1291                 resolution = last_info.get('RESOLUTION')
 1292                 if resolution:
 1293                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
 1294                     if mobj:
 1295                         f['width'] = int(mobj.group('width'))
 1296                         f['height'] = int(mobj.group('height'))
 1297                 # Unified Streaming Platform
 1298                 mobj = re.search(
 1299                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
 1300                 if mobj:
 1301                     abr, vbr = mobj.groups()
 1302                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
 1303                     f.update({
 1304                         'vbr': vbr,
 1305                         'abr': abr,
 1306                     })
 1307                 f.update(parse_codecs(last_info.get('CODECS')))
 1308                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False:
 1309                     # TODO: update acodec for for audio only formats with the same GROUP-ID
 1310                     f['acodec'] = 'none'
 1311                 formats.append(f)
 1312                 last_info = {}
 1313                 last_media = {}
 1314         return formats
 1315 
 1316     @staticmethod
 1317     def _xpath_ns(path, namespace=None):
 1318         if not namespace:
 1319             return path
 1320         out = []
 1321         for c in path.split('/'):
 1322             if not c or c == '.':
 1323                 out.append(c)
 1324             else:
 1325                 out.append('{%s}%s' % (namespace, c))
 1326         return '/'.join(out)
 1327 
 1328     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
 1329         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
 1330 
 1331         if smil is False:
 1332             assert not fatal
 1333             return []
 1334 
 1335         namespace = self._parse_smil_namespace(smil)
 1336 
 1337         return self._parse_smil_formats(
 1338             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
 1339 
 1340     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
 1341         smil = self._download_smil(smil_url, video_id, fatal=fatal)
 1342         if smil is False:
 1343             return {}
 1344         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
 1345 
 1346     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
 1347         return self._download_xml(
 1348             smil_url, video_id, 'Downloading SMIL file',
 1349             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
 1350 
 1351     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
 1352         namespace = self._parse_smil_namespace(smil)
 1353 
 1354         formats = self._parse_smil_formats(
 1355             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
 1356         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
 1357 
 1358         video_id = os.path.splitext(url_basename(smil_url))[0]
 1359         title = None
 1360         description = None
 1361         upload_date = None
 1362         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
 1363             name = meta.attrib.get('name')
 1364             content = meta.attrib.get('content')
 1365             if not name or not content:
 1366                 continue
 1367             if not title and name == 'title':
 1368                 title = content
 1369             elif not description and name in ('description', 'abstract'):
 1370                 description = content
 1371             elif not upload_date and name == 'date':
 1372                 upload_date = unified_strdate(content)
 1373 
 1374         thumbnails = [{
 1375             'id': image.get('type'),
 1376             'url': image.get('src'),
 1377             'width': int_or_none(image.get('width')),
 1378             'height': int_or_none(image.get('height')),
 1379         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
 1380 
 1381         return {
 1382             'id': video_id,
 1383             'title': title or video_id,
 1384             'description': description,
 1385             'upload_date': upload_date,
 1386             'thumbnails': thumbnails,
 1387             'formats': formats,
 1388             'subtitles': subtitles,
 1389         }
 1390 
 1391     def _parse_smil_namespace(self, smil):
 1392         return self._search_regex(
 1393             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
 1394 
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from a parsed SMIL document.

        Handles RTMP streams, HLS (m3u8), HDS (f4m) and plain HTTP
        <video>/<audio> sources.  transform_rtmp_url, if given, is called as
        (streamer, play_path) -> (streamer, play_path) to rewrite RTMP URLs
        after the format entry has been created.
        """
        # A <meta base="..."> (or httpBase) in the head overrides the manifest
        # URL as the base for resolving relative source URLs
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []  # source URLs seen so far, for de-duplication
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # bitrate attribute is divided by 1000 (bits/s -> KBit/s for tbr)
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    # bitrate is a float here; %d truncates it for the id
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single returned entry means a media (not master) playlist,
                # so the SMIL-level bitrate/size metadata applies to it directly
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                # Default HDS query parameters; note this rebinds the local
                # f4m_params for all subsequent f4m sources in this call
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            # NOTE(review): validity is probed with the original (possibly
            # relative) src rather than the resolved src_url — confirm intent
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats
 1488 
 1489     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
 1490         urls = []
 1491         subtitles = {}
 1492         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
 1493             src = textstream.get('src')
 1494             if not src or src in urls:
 1495                 continue
 1496             urls.append(src)
 1497             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
 1498             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
 1499             subtitles.setdefault(lang, []).append({
 1500                 'url': src,
 1501                 'ext': ext,
 1502             })
 1503         return subtitles
 1504 
 1505     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
 1506         xspf = self._download_xml(
 1507             playlist_url, playlist_id, 'Downloading xpsf playlist',
 1508             'Unable to download xspf manifest', fatal=fatal)
 1509         if xspf is False:
 1510             return []
 1511         return self._parse_xspf(xspf, playlist_id)
 1512 
 1513     def _parse_xspf(self, playlist, playlist_id):
 1514         NS_MAP = {
 1515             'xspf': 'http://xspf.org/ns/0/',
 1516             's1': 'http://static.streamone.nl/player/ns/0',
 1517         }
 1518 
 1519         entries = []
 1520         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
 1521             title = xpath_text(
 1522                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
 1523             description = xpath_text(
 1524                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
 1525             thumbnail = xpath_text(
 1526                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
 1527             duration = float_or_none(
 1528                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
 1529 
 1530             formats = [{
 1531                 'url': location.text,
 1532                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
 1533                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
 1534                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
 1535             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
 1536             self._sort_formats(formats)
 1537 
 1538             entries.append({
 1539                 'id': playlist_id,
 1540                 'title': title,
 1541                 'description': description,
 1542                 'thumbnail': thumbnail,
 1543                 'duration': duration,
 1544                 'formats': formats,
 1545             })
 1546         return entries
 1547 
 1548     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
 1549         res = self._download_webpage_handle(
 1550             mpd_url, video_id,
 1551             note=note or 'Downloading MPD manifest',
 1552             errnote=errnote or 'Failed to download MPD manifest',
 1553             fatal=fatal)
 1554         if res is False:
 1555             return []
 1556         mpd, urlh = res
 1557         mpd_base_url = base_url(urlh.geturl())
 1558 
 1559         return self._parse_mpd_formats(
 1560             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
 1561             formats_dict=formats_dict, mpd_url=mpd_url)
 1562 
 1563     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
 1564         """
 1565         Parse formats from MPD manifest.
 1566         References:
 1567          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
 1568             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
 1569          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
 1570         """
 1571         if mpd_doc.get('type') == 'dynamic':
 1572             return []
 1573 
 1574         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
 1575 
 1576         def _add_ns(path):
 1577             return self._xpath_ns(path, namespace)
 1578 
 1579         def is_drm_protected(element):
 1580             return element.find(_add_ns('ContentProtection')) is not None
 1581 
 1582         def extract_multisegment_info(element, ms_parent_info):
 1583             ms_info = ms_parent_info.copy()
 1584 
 1585             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
 1586             # common attributes and elements.  We will only extract relevant
 1587             # for us.
 1588             def extract_common(source):
 1589                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
 1590                 if segment_timeline is not None:
 1591                     s_e = segment_timeline.findall(_add_ns('S'))
 1592                     if s_e:
 1593                         ms_info['total_number'] = 0
 1594                         ms_info['s'] = []
 1595                         for s in s_e:
 1596                             r = int(s.get('r', 0))
 1597                             ms_info['total_number'] += 1 + r
 1598                             ms_info['s'].append({
 1599                                 't': int(s.get('t', 0)),
 1600                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
 1601                                 'd': int(s.attrib['d']),
 1602                                 'r': r,
 1603                             })
 1604                 start_number = source.get('startNumber')
 1605                 if start_number:
 1606                     ms_info['start_number'] = int(start_number)
 1607                 timescale = source.get('timescale')
 1608                 if timescale:
 1609                     ms_info['timescale'] = int(timescale)
 1610                 segment_duration = source.get('duration')
 1611                 if segment_duration:
 1612                     ms_info['segment_duration'] = int(segment_duration)
 1613 
 1614             def extract_Initialization(source):
 1615                 initialization = source.find(_add_ns('Initialization'))
 1616                 if initialization is not None:
 1617                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
 1618 
 1619             segment_list = element.find(_add_ns('SegmentList'))
 1620             if segment_list is not None:
 1621                 extract_common(segment_list)
 1622                 extract_Initialization(segment_list)
 1623                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
 1624                 if segment_urls_e:
 1625                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
 1626             else:
 1627                 segment_template = element.find(_add_ns('SegmentTemplate'))
 1628                 if segment_template is not None:
 1629                     extract_common(segment_template)
 1630                     media_template = segment_template.get('media')
 1631                     if media_template:
 1632                         ms_info['media_template'] = media_template
 1633                     initialization = segment_template.get('initialization')
 1634                     if initialization:
 1635                         ms_info['initialization_url'] = initialization
 1636                     else:
 1637                         extract_Initialization(segment_template)
 1638             return ms_info
 1639 
 1640         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
 1641         formats = []
 1642         for period in mpd_doc.findall(_add_ns('Period')):
 1643             period_duration = parse_duration(period.get('duration')) or mpd_duration
 1644             period_ms_info = extract_multisegment_info(period, {
 1645                 'start_number': 1,
 1646                 'timescale': 1,
 1647             })
 1648             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
 1649                 if is_drm_protected(adaptation_set):
 1650                     continue
 1651                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
 1652                 for representation in adaptation_set.findall(_add_ns('Representation')):
 1653                     if is_drm_protected(representation):
 1654                         continue
 1655                     representation_attrib = adaptation_set.attrib.copy()
 1656                     representation_attrib.update(representation.attrib)
 1657                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
 1658                     mime_type = representation_attrib['mimeType']
 1659                     content_type = mime_type.split('/')[0]
 1660                     if content_type == 'text':
 1661                         # TODO implement WebVTT downloading
 1662                         pass
 1663                     elif content_type == 'video' or content_type == 'audio':
 1664                         base_url = ''
 1665                         for element in (representation, adaptation_set, period, mpd_doc):
 1666                             base_url_e = element.find(_add_ns('BaseURL'))
 1667                             if base_url_e is not None:
 1668                                 base_url = base_url_e.text + base_url
 1669                                 if re.match(r'^https?://', base_url):
 1670                                     break
 1671                         if mpd_base_url and not re.match(r'^https?://', base_url):
 1672                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
 1673                                 mpd_base_url += '/'
 1674                             base_url = mpd_base_url + base_url
 1675                         representation_id = representation_attrib.get('id')
 1676                         lang = representation_attrib.get('lang')
 1677                         url_el = representation.find(_add_ns('BaseURL'))
 1678                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
 1679                         f = {
 1680                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
 1681                             'url': base_url,
 1682                             'manifest_url': mpd_url,
 1683                             'ext': mimetype2ext(mime_type),
 1684                             'width': int_or_none(representation_attrib.get('width')),
 1685                             'height': int_or_none(representation_attrib.get('height')),
 1686                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
 1687                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
 1688                             'fps': int_or_none(representation_attrib.get('frameRate')),
 1689                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
 1690                             'format_note': 'DASH %s' % content_type,
 1691                             'filesize': filesize,
 1692                         }
 1693                         f.update(parse_codecs(representation_attrib.get('codecs')))
 1694                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
 1695                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
 1696 
 1697                             media_template = representation_ms_info['media_template']
 1698                             media_template = media_template.replace('$RepresentationID$', representation_id)
 1699                             media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
 1700                             media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
 1701                             media_template.replace('$$', '$')
 1702 
 1703                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
 1704                             # can't be used at the same time
 1705                             if '%(Number' in media_template and 's' not in representation_ms_info:
 1706                                 segment_duration = None
 1707                                 if 'total_number' not in representation_ms_info and 'segment_duration':
 1708                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
 1709                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
 1710                                 representation_ms_info['fragments'] = [{
 1711                                     'url': media_template % {
 1712                                         'Number': segment_number,
 1713                                         'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
 1714                                     },
 1715                                     'duration': segment_duration,
 1716                                 } for segment_number in range(
 1717                                     representation_ms_info['start_number'],
 1718                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
 1719                             else:
 1720                                 # $Number*$ or $Time$ in media template with S list available
 1721                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
 1722                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
 1723                                 representation_ms_info['fragments'] = []
 1724                                 segment_time = 0
 1725                                 segment_d = None
 1726                                 segment_number = representation_ms_info['start_number']
 1727 
 1728                                 def add_segment_url():
 1729                                     segment_url = media_template % {
 1730                                         'Time': segment_time,
 1731                                         'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
 1732                                         'Number': segment_number,
 1733                                     }
 1734                                     representation_ms_info['fragments'].append({
 1735                                         'url': segment_url,
 1736                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
 1737                                     })
 1738 
 1739                                 for num, s in enumerate(representation_ms_info['s']):
 1740                                     segment_time = s.get('t') or segment_time
 1741                                     segment_d = s['d']
 1742                                     add_segment_url()
 1743                                     segment_number += 1
 1744                                     for r in range(s.get('r', 0)):
 1745                                         segment_time += segment_d
 1746                                         add_segment_url()
 1747                                         segment_number += 1
 1748                                     segment_time += segment_d
 1749                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
 1750                             # No media template
 1751                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
 1752                             # or any YouTube dashsegments video
 1753                             fragments = []
 1754                             s_num = 0
 1755                             for segment_url in representation_ms_info['segment_urls']:
 1756                                 s = representation_ms_info['s'][s_num]
 1757                                 for r in range(s.get('r', 0) + 1):
 1758                                     fragments.append({
 1759                                         'url': segment_url,
 1760                                         'duration': float_or_none(s['d'], representation_ms_info['timescale']),
 1761                                     })
 1762                             representation_ms_info['fragments'] = fragments
 1763                         # NB: MPD manifest may contain direct URLs to unfragmented media.
 1764                         # No fragments key is present in this case.
 1765                         if 'fragments' in representation_ms_info:
 1766                             f.update({
 1767                                 'fragments': [],
 1768                                 'protocol': 'http_dash_segments',
 1769                             })
 1770                             if 'initialization_url' in representation_ms_info:
 1771                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
 1772                                 if not f.get('url'):
 1773                                     f['url'] = initialization_url
 1774                                 f['fragments'].append({'url': initialization_url})
 1775                             f['fragments'].extend(representation_ms_info['fragments'])
 1776                             for fragment in f['fragments']:
 1777                                 fragment['url'] = urljoin(base_url, fragment['url'])
 1778                         try:
 1779                             existing_format = next(
 1780                                 fo for fo in formats
 1781                                 if fo['format_id'] == representation_id)
 1782                         except StopIteration:
 1783                             full_info = formats_dict.get(representation_id, {}).copy()
 1784                             full_info.update(f)
 1785                             formats.append(full_info)
 1786                         else:
 1787                             existing_format.update(f)
 1788                     else:
 1789                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
 1790         return formats
 1791 
 1792     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
 1793         res = self._download_webpage_handle(
 1794             ism_url, video_id,
 1795             note=note or 'Downloading ISM manifest',
 1796             errnote=errnote or 'Failed to download ISM manifest',
 1797             fatal=fatal)
 1798         if res is False:
 1799             return []
 1800         ism, urlh = res
 1801 
 1802         return self._parse_ism_formats(
 1803             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
 1804 
 1805     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
 1806         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
 1807             return []
 1808 
 1809         duration = int(ism_doc.attrib['Duration'])
 1810         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
 1811 
 1812         formats = []
 1813         for stream in ism_doc.findall('StreamIndex'):
 1814             stream_type = stream.get('Type')
 1815             if stream_type not in ('video', 'audio'):
 1816                 continue
 1817             url_pattern = stream.attrib['Url']
 1818             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
 1819             stream_name = stream.get('Name')
 1820             for track in stream.findall('QualityLevel'):
 1821                 fourcc = track.get('FourCC')
 1822                 # TODO: add support for WVC1 and WMAP
 1823                 if fourcc not in ('H264', 'AVC1', 'AACL'):
 1824                     self.report_warning('%s is not a supported codec' % fourcc)
 1825                     continue
 1826                 tbr = int(track.attrib['Bitrate']) // 1000
 1827                 width = int_or_none(track.get('MaxWidth'))
 1828                 height = int_or_none(track.get('MaxHeight'))
 1829                 sampling_rate = int_or_none(track.get('SamplingRate'))
 1830 
 1831                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
 1832                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
 1833 
 1834                 fragments = []
 1835                 fragment_ctx = {
 1836                     'time': 0,
 1837                 }
 1838                 stream_fragments = stream.findall('c')
 1839                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
 1840                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
 1841                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
 1842                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
 1843                     if not fragment_ctx['duration']:
 1844                         try:
 1845                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
 1846                         except IndexError:
 1847                             next_fragment_time = duration
 1848                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
 1849                     for _ in range(fragment_repeat):
 1850                         fragments.append({
 1851                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
 1852                             'duration': fragment_ctx['duration'] / stream_timescale,
 1853                         })
 1854                         fragment_ctx['time'] += fragment_ctx['duration']
 1855 
 1856                 format_id = []
 1857                 if ism_id:
 1858                     format_id.append(ism_id)
 1859                 if stream_name:
 1860                     format_id.append(stream_name)
 1861                 format_id.append(compat_str(tbr))
 1862 
 1863                 formats.append({
 1864                     'format_id': '-'.join(format_id),
 1865                     'url': ism_url,
 1866                     'manifest_url': ism_url,
 1867                     'ext': 'ismv' if stream_type == 'video' else 'isma',
 1868                     'width': width,
 1869                     'height': height,
 1870                     'tbr': tbr,
 1871                     'asr': sampling_rate,
 1872                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
 1873                     'acodec': 'none' if stream_type == 'video' else fourcc,
 1874                     'protocol': 'ism',
 1875                     'fragments': fragments,
 1876                     '_download_params': {
 1877                         'duration': duration,
 1878                         'timescale': stream_timescale,
 1879                         'width': width or 0,
 1880                         'height': height or 0,
 1881                         'fourcc': fourcc,
 1882                         'codec_private_data': track.get('CodecPrivateData'),
 1883                         'sampling_rate': sampling_rate,
 1884                         'channels': int_or_none(track.get('Channels', 2)),
 1885                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
 1886                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
 1887                     },
 1888                 })
 1889         return formats
 1890 
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
        """Extract entries from HTML5 <video>/<audio> tags found in webpage.

        Returns a list of dicts, one per media tag that yielded at least one
        format or subtitle track; each dict has 'formats', 'subtitles' and
        'thumbnail' (taken from the poster attribute) keys.
        """
        def absolute_url(video_url):
            # Resolve src attributes relative to the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Turn a <source type="..."> value (MIME type plus optional codecs
            # parameter) into ext/vcodec/acodec format fields.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into their full format lists, any other URL yields a
            # single plain format dict.
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # Collect self-closing tags first (they have no inner content), then
        # open/close pairs whose inner content may hold <source>/<track> tags.
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        # For direct URLs, enrich the single format with
                        # codec/ext info parsed from the type attribute.
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # A <track> without an explicit kind is treated as subtitles.
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
 1969 
 1970     def _extract_akamai_formats(self, manifest_url, video_id):
 1971         formats = []
 1972         hdcore_sign = 'hdcore=3.7.0'
 1973         f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
 1974         if 'hdcore=' not in f4m_url:
 1975             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
 1976         f4m_formats = self._extract_f4m_formats(
 1977             f4m_url, video_id, f4m_id='hds', fatal=False)
 1978         for entry in f4m_formats:
 1979             entry.update({'extra_param_to_segment_url': hdcore_sign})
 1980         formats.extend(f4m_formats)
 1981         m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
 1982         formats.extend(self._extract_m3u8_formats(
 1983             m3u8_url, video_id, 'mp4', 'm3u8_native',
 1984             m3u8_id='hls', fatal=False))
 1985         return formats
 1986 
 1987     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
 1988         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
 1989         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
 1990         http_base_url = 'http' + url_base
 1991         formats = []
 1992         if 'm3u8' not in skip_protocols:
 1993             formats.extend(self._extract_m3u8_formats(
 1994                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
 1995                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
 1996         if 'f4m' not in skip_protocols:
 1997             formats.extend(self._extract_f4m_formats(
 1998                 http_base_url + '/manifest.f4m',
 1999                 video_id, f4m_id='hds', fatal=False))
 2000         if 'dash' not in skip_protocols:
 2001             formats.extend(self._extract_mpd_formats(
 2002                 http_base_url + '/manifest.mpd',
 2003                 video_id, mpd_id='dash', fatal=False))
 2004         if re.search(r'(?:/smil:|\.smil)', url_base):
 2005             if 'smil' not in skip_protocols:
 2006                 rtmp_formats = self._extract_smil_formats(
 2007                     http_base_url + '/jwplayer.smil',
 2008                     video_id, fatal=False)
 2009                 for rtmp_format in rtmp_formats:
 2010                     rtsp_format = rtmp_format.copy()
 2011                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
 2012                     del rtsp_format['play_path']
 2013                     del rtsp_format['ext']
 2014                     rtsp_format.update({
 2015                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
 2016                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
 2017                         'protocol': 'rtsp',
 2018                     })
 2019                     formats.extend([rtmp_format, rtsp_format])
 2020         else:
 2021             for protocol in ('rtmp', 'rtsp'):
 2022                 if protocol not in skip_protocols:
 2023                     formats.append({
 2024                         'url': protocol + url_base,
 2025                         'format_id': protocol,
 2026                         'protocol': protocol,
 2027                     })
 2028         return formats
 2029 
 2030     def _live_title(self, name):
 2031         """ Generate the title for a live video """
 2032         now = datetime.datetime.now()
 2033         now_str = now.strftime('%Y-%m-%d %H:%M')
 2034         return name + ' ' + now_str
 2035 
 2036     def _int(self, v, name, fatal=False, **kwargs):
 2037         res = int_or_none(v, **kwargs)
 2038         if 'get_attr' in kwargs:
 2039             print(getattr(v, kwargs['get_attr']))
 2040         if res is None:
 2041             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 2042             if fatal:
 2043                 raise ExtractorError(msg)
 2044             else:
 2045                 self._downloader.report_warning(msg)
 2046         return res
 2047 
 2048     def _float(self, v, name, fatal=False, **kwargs):
 2049         res = float_or_none(v, **kwargs)
 2050         if res is None:
 2051             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 2052             if fatal:
 2053                 raise ExtractorError(msg)
 2054             else:
 2055                 self._downloader.report_warning(msg)
 2056         return res
 2057 
    def _set_cookie(self, domain, name, value, expire_time=None):
        """Store a cookie in the downloader's cookiejar.

        The compat_cookiejar.Cookie constructor is purely positional; the
        arguments below map to: version, name, value, port, port_specified,
        domain, domain_specified, domain_initial_dot, path, path_specified,
        secure, expires, discard, comment, comment_url, rest.
        """
        # NOTE(review): discard is passed '' and rest None rather than a bool
        # and a dict; looks accidental — confirm against the
        # http.cookiejar.Cookie constructor's expectations before changing.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
 2063 
 2064     def _get_cookies(self, url):
 2065         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
 2066         req = sanitized_Request(url)
 2067         self._downloader.cookiejar.add_cookie_header(req)
 2068         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
 2069 
 2070     def get_testcases(self, include_onlymatching=False):
 2071         t = getattr(self, '_TEST', None)
 2072         if t:
 2073             assert not hasattr(self, '_TESTS'), \
 2074                 '%s has _TEST and _TESTS' % type(self).__name__
 2075             tests = [t]
 2076         else:
 2077             tests = getattr(self, '_TESTS', [])
 2078         for t in tests:
 2079             if not include_onlymatching and t.get('only_matching', False):
 2080                 continue
 2081             t['name'] = type(self).__name__[:-len('IE')]
 2082             yield t
 2083 
 2084     def is_suitable(self, age_limit):
 2085         """ Test whether the extractor is generally suitable for the given
 2086         age limit (i.e. pornographic sites are not, all others usually are) """
 2087 
 2088         any_restricted = False
 2089         for tc in self.get_testcases(include_onlymatching=False):
 2090             if tc.get('playlist', []):
 2091                 tc = tc['playlist'][0]
 2092             is_restricted = age_restricted(
 2093                 tc.get('info_dict', {}).get('age_limit'), age_limit)
 2094             if not is_restricted:
 2095                 return True
 2096             any_restricted = any_restricted or is_restricted
 2097         return not any_restricted
 2098 
 2099     def extract_subtitles(self, *args, **kwargs):
 2100         if (self._downloader.params.get('writesubtitles', False) or
 2101                 self._downloader.params.get('listsubtitles')):
 2102             return self._get_subtitles(*args, **kwargs)
 2103         return {}
 2104 
    def _get_subtitles(self, *args, **kwargs):
        """Return subtitles as a dict mapping language codes to lists of
        subtitle info dicts. Must be overridden by subclasses that support
        subtitles."""
        raise NotImplementedError('This method must be implemented by subclasses')
 2107 
 2108     @staticmethod
 2109     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
 2110         """ Merge subtitle items for one language. Items with duplicated URLs
 2111         will be dropped. """
 2112         list1_urls = set([item['url'] for item in subtitle_list1])
 2113         ret = list(subtitle_list1)
 2114         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
 2115         return ret
 2116 
 2117     @classmethod
 2118     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
 2119         """ Merge two subtitle dictionaries, language by language. """
 2120         ret = dict(subtitle_dict1)
 2121         for lang in subtitle_dict2:
 2122             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
 2123         return ret
 2124 
 2125     def extract_automatic_captions(self, *args, **kwargs):
 2126         if (self._downloader.params.get('writeautomaticsub', False) or
 2127                 self._downloader.params.get('listsubtitles')):
 2128             return self._get_automatic_captions(*args, **kwargs)
 2129         return {}
 2130 
    def _get_automatic_captions(self, *args, **kwargs):
        """Return automatic captions in the same dict format as
        _get_subtitles. Must be overridden by subclasses that support them."""
        raise NotImplementedError('This method must be implemented by subclasses')
 2133 
 2134     def mark_watched(self, *args, **kwargs):
 2135         if (self._downloader.params.get('mark_watched', False) and
 2136                 (self._get_login_info()[0] is not None or
 2137                     self._downloader.params.get('cookiefile') is not None)):
 2138             self._mark_watched(*args, **kwargs)
 2139 
    def _mark_watched(self, *args, **kwargs):
        """Perform the site-specific mark-watched request. Must be overridden
        by subclasses that support marking videos as watched."""
        raise NotImplementedError('This method must be implemented by subclasses')
 2142 
 2143     def geo_verification_headers(self):
 2144         headers = {}
 2145         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
 2146         if geo_verification_proxy:
 2147             headers['Ytdl-request-proxy'] = geo_verification_proxy
 2148         return headers
 2149 
 2150     def _generic_id(self, url):
 2151         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 2152 
 2153     def _generic_title(self, url):
 2154         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
 2155 
 2156 
 2157 class SearchInfoExtractor(InfoExtractor):
 2158     """
 2159     Base class for paged search queries extractors.
 2160     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
 2161     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 2162     """
 2163 
 2164     @classmethod
 2165     def _make_valid_url(cls):
 2166         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 2167 
 2168     @classmethod
 2169     def suitable(cls, url):
 2170         return re.match(cls._make_valid_url(), url) is not None
 2171 
 2172     def _real_extract(self, query):
 2173         mobj = re.match(self._make_valid_url(), query)
 2174         if mobj is None:
 2175             raise ExtractorError('Invalid search query "%s"' % query)
 2176 
 2177         prefix = mobj.group('prefix')
 2178         query = mobj.group('query')
 2179         if prefix == '':
 2180             return self._get_n_results(query, 1)
 2181         elif prefix == 'all':
 2182             return self._get_n_results(query, self._MAX_RESULTS)
 2183         else:
 2184             n = int(prefix)
 2185             if n <= 0:
 2186                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 2187             elif n > self._MAX_RESULTS:
 2188                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 2189                 n = self._MAX_RESULTS
 2190             return self._get_n_results(query, n)
 2191 
 2192     def _get_n_results(self, query, n):
 2193         """Get a specified number of results for a query"""
 2194         raise NotImplementedError('This method must be implemented by subclasses')
 2195 
 2196     @property
 2197     def SEARCH_KEY(self):
 2198         return self._SEARCH_KEY

Generated by cgit