summaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
blob: a0a796d7b406f799462d2c6395a8f7b7d5aa7a10 (plain)
    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import base64
    5 import datetime
    6 import hashlib
    7 import json
    8 import netrc
    9 import os
   10 import random
   11 import re
   12 import socket
   13 import ssl
   14 import sys
   15 import time
   16 import math
   17 
   18 from ..compat import (
   19     compat_cookiejar_Cookie,
   20     compat_cookies_SimpleCookie,
   21     compat_etree_Element,
   22     compat_etree_fromstring,
   23     compat_getpass,
   24     compat_integer_types,
   25     compat_http_client,
   26     compat_os_name,
   27     compat_str,
   28     compat_urllib_error,
   29     compat_urllib_parse_unquote,
   30     compat_urllib_parse_urlencode,
   31     compat_urllib_request,
   32     compat_urlparse,
   33     compat_xml_parse_error,
   34 )
   35 from ..downloader.f4m import (
   36     get_base_url,
   37     remove_encrypted_media,
   38 )
   39 from ..utils import (
   40     NO_DEFAULT,
   41     age_restricted,
   42     base_url,
   43     bug_reports_message,
   44     clean_html,
   45     compiled_regex_type,
   46     determine_ext,
   47     determine_protocol,
   48     dict_get,
   49     error_to_compat_str,
   50     ExtractorError,
   51     extract_attributes,
   52     fix_xml_ampersands,
   53     float_or_none,
   54     GeoRestrictedError,
   55     GeoUtils,
   56     int_or_none,
   57     js_to_json,
   58     JSON_LD_RE,
   59     mimetype2ext,
   60     orderedSet,
   61     parse_bitrate,
   62     parse_codecs,
   63     parse_duration,
   64     parse_iso8601,
   65     parse_m3u8_attributes,
   66     parse_resolution,
   67     RegexNotFoundError,
   68     sanitized_Request,
   69     sanitize_filename,
   70     str_or_none,
   71     str_to_int,
   72     strip_or_none,
   73     try_get,
   74     unescapeHTML,
   75     unified_strdate,
   76     unified_timestamp,
   77     update_Request,
   78     update_url_query,
   79     urljoin,
   80     url_basename,
   81     url_or_none,
   82     xpath_element,
   83     xpath_text,
   84     xpath_with_ns,
   85 )
   86 
   87 
   88 class InfoExtractor(object):
   89     """Information Extractor class.
   90 
   91     Information extractors are the classes that, given a URL, extract
   92     information about the video (or videos) the URL refers to. This
   93     information includes the real video URL, the video title, author and
   94     others. The information is stored in a dictionary which is then
   95     passed to the YoutubeDL. The YoutubeDL processes this
   96     information possibly downloading the video to the file system, among
   97     other possible outcomes.
   98 
   99     The type field determines the type of the result.
  100     By far the most common value (and the default if _type is missing) is
  101     "video", which indicates a single video.
  102 
  103     For a video, the dictionaries must include the following fields:
  104 
  105     id:             Video identifier.
  106     title:          Video title, unescaped.
  107 
  108     Additionally, it must contain either a formats entry or a url one:
  109 
  110     formats:        A list of dictionaries for each format available, ordered
  111                     from worst to best quality.
  112 
  113                     Potential fields:
  114                     * url        The mandatory URL representing the media:
  115                                    for plain file media - HTTP URL of this file,
  116                                    for RTMP - RTMP URL,
  117                                    for HLS - URL of the M3U8 media playlist,
  118                                    for HDS - URL of the F4M manifest,
  119                                    for DASH
  120                                      - HTTP URL to plain file media (in case of
  121                                        unfragmented media)
  122                                      - URL of the MPD manifest or base URL
  123                                        representing the media if MPD manifest
  124                                        is parsed from a string (in case of
  125                                        fragmented media)
  126                                    for MSS - URL of the ISM manifest.
  127                     * manifest_url
  128                                  The URL of the manifest file in case of
  129                                  fragmented media:
  130                                    for HLS - URL of the M3U8 master playlist,
  131                                    for HDS - URL of the F4M manifest,
  132                                    for DASH - URL of the MPD manifest,
  133                                    for MSS - URL of the ISM manifest.
  134                     * ext        Will be calculated from URL if missing
  135                     * format     A human-readable description of the format
  136                                  ("mp4 container with h264/opus").
  137                                  Calculated from the format_id, width, height,
  138                                  and format_note fields if missing.
  139                     * format_id  A short description of the format
  140                                  ("mp4_h264_opus" or "19").
  141                                 Technically optional, but strongly recommended.
  142                     * format_note Additional info about the format
  143                                  ("3D" or "DASH video")
  144                     * width      Width of the video, if known
  145                     * height     Height of the video, if known
  146                     * resolution Textual description of width and height
  147                     * tbr        Average bitrate of audio and video in KBit/s
  148                     * abr        Average audio bitrate in KBit/s
  149                     * acodec     Name of the audio codec in use
  150                     * asr        Audio sampling rate in Hertz
  151                     * vbr        Average video bitrate in KBit/s
  152                     * fps        Frame rate
  153                     * vcodec     Name of the video codec in use
  154                     * container  Name of the container format
  155                     * filesize   The number of bytes, if known in advance
  156                     * filesize_approx  An estimate for the number of bytes
  157                     * player_url SWF Player URL (used for rtmpdump).
  158                     * protocol   The protocol that will be used for the actual
  159                                  download, lower-case.
  160                                  "http", "https", "rtsp", "rtmp", "rtmpe",
  161                                  "m3u8", "m3u8_native" or "http_dash_segments".
  162                     * fragment_base_url
  163                                  Base URL for fragments. Each fragment's path
  164                                  value (if present) will be relative to
  165                                  this URL.
  166                     * fragments  A list of fragments of a fragmented media.
  167                                  Each fragment entry must contain either an url
  168                                  or a path. If an url is present it should be
  169                                  considered by a client. Otherwise both path and
  170                                  fragment_base_url must be present. Here is
  171                                  the list of all potential fields:
  172                                  * "url" - fragment's URL
  173                                  * "path" - fragment's path relative to
  174                                             fragment_base_url
  175                                  * "duration" (optional, int or float)
  176                                  * "filesize" (optional, int)
  177                     * preference Order number of this format. If this field is
  178                                  present and not None, the formats get sorted
  179                                  by this field, regardless of all other values.
  180                                  -1 for default (order by other properties),
  181                                  -2 or smaller for less than default.
  182                                  < -1000 to hide the format (if there is
  183                                     another one which is strictly better)
  184                     * language   Language code, e.g. "de" or "en-US".
  185                     * language_preference  Is this in the language mentioned in
  186                                  the URL?
  187                                  10 if it's what the URL is about,
  188                                  -1 for default (don't know),
  189                                  -10 otherwise, other values reserved for now.
  190                     * quality    Order number of the video quality of this
  191                                  format, irrespective of the file format.
  192                                  -1 for default (order by other properties),
  193                                  -2 or smaller for less than default.
  194                     * source_preference  Order number for this video source
  195                                   (quality takes higher priority)
  196                                  -1 for default (order by other properties),
  197                                  -2 or smaller for less than default.
  198                     * http_headers  A dictionary of additional HTTP headers
  199                                  to add to the request.
  200                     * stretched_ratio  If given and not 1, indicates that the
  201                                  video's pixels are not square.
  202                                  width : height ratio as float.
  203                     * no_resume  The server does not support resuming the
  204                                  (HTTP or RTMP) download. Boolean.
  205                     * downloader_options  A dictionary of downloader options as
  206                                  described in FileDownloader
  207 
  208     url:            Final video URL.
  209     ext:            Video filename extension.
  210     format:         The video format, defaults to ext (used for --get-format)
  211     player_url:     SWF Player URL (used for rtmpdump).
  212 
  213     The following fields are optional:
  214 
  215     alt_title:      A secondary title of the video.
  216     display_id      An alternative identifier for the video, not necessarily
  217                     unique, but available before title. Typically, id is
  218                     something like "4234987", title "Dancing naked mole rats",
  219                     and display_id "dancing-naked-mole-rats"
  220     thumbnails:     A list of dictionaries, with the following entries:
  221                         * "id" (optional, string) - Thumbnail format ID
  222                         * "url"
  223                         * "preference" (optional, int) - quality of the image
  224                         * "width" (optional, int)
  225                         * "height" (optional, int)
  226                         * "resolution" (optional, string "{width}x{height}",
  227                                         deprecated)
  228                         * "filesize" (optional, int)
  229     thumbnail:      Full URL to a video thumbnail image.
  230     description:    Full video description.
  231     uploader:       Full name of the video uploader.
  232     license:        License name the video is licensed under.
  233     creator:        The creator of the video.
  234     release_timestamp: UNIX timestamp of the moment the video was released.
  235     release_date:   The date (YYYYMMDD) when the video was released.
  236     timestamp:      UNIX timestamp of the moment the video became available
  237                     (uploaded).
  238     upload_date:    Video upload date (YYYYMMDD).
  239                     If not explicitly set, calculated from timestamp.
  240     uploader_id:    Nickname or id of the video uploader.
  241     uploader_url:   Full URL to a personal webpage of the video uploader.
  242     channel:        Full name of the channel the video is uploaded on.
  243                     Note that channel fields may or may not repeat uploader
  244                     fields. This depends on a particular extractor.
  245     channel_id:     Id of the channel.
  246     channel_url:    Full URL to a channel webpage.
  247     location:       Physical location where the video was filmed.
  248     subtitles:      The available subtitles as a dictionary in the format
  249                     {tag: subformats}. "tag" is usually a language code, and
  250                     "subformats" is a list sorted from lower to higher
  251                     preference, each element is a dictionary with the "ext"
  252                     entry and one of:
  253                         * "data": The subtitles file contents
  254                         * "url": A URL pointing to the subtitles file
  255                     "ext" will be calculated from URL if missing
  256     automatic_captions: Like 'subtitles', used by the YoutubeIE for
  257                     automatically generated captions
  258     duration:       Length of the video in seconds, as an integer or float.
  259     view_count:     How many users have watched the video on the platform.
  260     like_count:     Number of positive ratings of the video
  261     dislike_count:  Number of negative ratings of the video
  262     repost_count:   Number of reposts of the video
  263     average_rating: Average rating given by users, the scale used depends on the webpage
  264     comment_count:  Number of comments on the video
  265     comments:       A list of comments, each with one or more of the following
  266                     properties (all but one of text or html optional):
  267                         * "author" - human-readable name of the comment author
  268                         * "author_id" - user ID of the comment author
  269                         * "id" - Comment ID
  270                         * "html" - Comment as HTML
  271                         * "text" - Plain text of the comment
  272                         * "timestamp" - UNIX timestamp of comment
  273                         * "parent" - ID of the comment this one is replying to.
  274                                      Set to "root" to indicate that this is a
  275                                      comment to the original video.
  276     age_limit:      Age restriction for the video, as an integer (years)
  277     webpage_url:    The URL to the video webpage, if given to youtube-dl it
  278                     should allow to get the same result again. (It will be set
  279                     by YoutubeDL if it's missing)
  280     categories:     A list of categories that the video falls in, for example
  281                     ["Sports", "Berlin"]
  282     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
  283     is_live:        True, False, or None (=unknown). Whether this video is a
  284                     live stream that goes on instead of a fixed-length video.
  285     start_time:     Time in seconds where the reproduction should start, as
  286                     specified in the URL.
  287     end_time:       Time in seconds where the reproduction should end, as
  288                     specified in the URL.
  289     chapters:       A list of dictionaries, with the following entries:
  290                         * "start_time" - The start time of the chapter in seconds
  291                         * "end_time" - The end time of the chapter in seconds
  292                         * "title" (optional, string)
  293 
  294     The following fields should only be used when the video belongs to some logical
  295     chapter or section:
  296 
  297     chapter:        Name or title of the chapter the video belongs to.
  298     chapter_number: Number of the chapter the video belongs to, as an integer.
  299     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
  300 
  301     The following fields should only be used when the video is an episode of some
  302     series, programme or podcast:
  303 
  304     series:         Title of the series or programme the video episode belongs to.
  305     season:         Title of the season the video episode belongs to.
  306     season_number:  Number of the season the video episode belongs to, as an integer.
  307     season_id:      Id of the season the video episode belongs to, as a unicode string.
  308     episode:        Title of the video episode. Unlike mandatory video title field,
  309                     this field should denote the exact title of the video episode
  310                     without any kind of decoration.
  311     episode_number: Number of the video episode within a season, as an integer.
  312     episode_id:     Id of the video episode, as a unicode string.
  313 
  314     The following fields should only be used when the media is a track or a part of
  315     a music album:
  316 
  317     track:          Title of the track.
  318     track_number:   Number of the track within an album or a disc, as an integer.
  319     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
  320                     as a unicode string.
  321     artist:         Artist(s) of the track.
  322     genre:          Genre(s) of the track.
  323     album:          Title of the album the track belongs to.
  324     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  325     album_artist:   List of all artists appeared on the album (e.g.
  326                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
  327                     and compilations).
  328     disc_number:    Number of the disc or other physical medium the track belongs to,
  329                     as an integer.
  330     release_year:   Year (YYYY) when the album was released.
  331 
  332     Unless mentioned otherwise, the fields should be Unicode strings.
  333 
  334     Unless mentioned otherwise, None is equivalent to absence of information.
  335 
  336 
  337     _type "playlist" indicates multiple videos.
  338     There must be a key "entries", which is a list, an iterable, or a PagedList
  339     object, each element of which is a valid dictionary by this specification.
  340 
  341     Additionally, playlists can have "id", "title", "description", "uploader",
  342     "uploader_id", "uploader_url", "duration" attributes with the same semantics
  343     as videos (see above).
  344 
  345 
  346     _type "multi_video" indicates that there are multiple videos that
  347     form a single show, for example multiple acts of an opera or TV episode.
  348     It must have an entries key like a playlist and contain all the keys
  349     required for a video at the same time.
  350 
  351 
  352     _type "url" indicates that the video must be extracted from another
  353     location, possibly by a different extractor. Its only required key is:
  354     "url" - the next URL to extract.
  355     The key "ie_key" can be set to the class name (minus the trailing "IE",
  356     e.g. "Youtube") if the extractor class is known in advance.
  357     Additionally, the dictionary may have any properties of the resolved entity
  358     known in advance, for example "title" if the title of the referred video is
  359     known ahead of time.
  360 
  361 
  362     _type "url_transparent" entities have the same specification as "url", but
  363     indicate that the given additional information is more precise than the one
  364     associated with the resolved URL.
  365     This is useful when a site employs a video service that hosts the video and
  366     its technical metadata, but that video service does not embed a useful
  367     title, description etc.
  368 
  369 
  370     Subclasses of this one should re-define the _real_initialize() and
  371     _real_extract() methods and define a _VALID_URL regexp.
  372     Probably, they should also be added to the list of extractors.
  373 
  374     _GEO_BYPASS attribute may be set to False in order to disable
  375     geo restriction bypass mechanisms for a particular extractor.
  376     Though it won't disable explicit geo restriction bypass based on
  377     country code provided with geo_bypass_country.
  378 
  379     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
  380     countries for this extractor. One of these countries will be used by
  381     geo restriction bypass mechanism right away in order to bypass
  382     geo restriction, of course, if the mechanism is not disabled.
  383 
  384     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
  385     IP blocks in CIDR notation for this extractor. One of these IP blocks
  386     will be used by geo restriction bypass mechanism similarly
  387     to _GEO_COUNTRIES.
  388 
  389     Finally, the _WORKING attribute should be set to False for broken IEs
  390     in order to warn the users and skip the tests.
  391     """
  392 
  393     _ready = False
  394     _downloader = None
  395     _x_forwarded_for_ip = None
  396     _GEO_BYPASS = True
  397     _GEO_COUNTRIES = None
  398     _GEO_IP_BLOCKS = None
  399     _WORKING = True
  400 
  401     def __init__(self, downloader=None):
  402         """Constructor. Receives an optional downloader."""
  403         self._ready = False
  404         self._x_forwarded_for_ip = None
  405         self.set_downloader(downloader)
  406 
  407     @classmethod
  408     def suitable(cls, url):
  409         """Receives a URL and returns True if suitable for this IE."""
  410 
  411         # This does not use has/getattr intentionally - we want to know whether
  412         # we have cached the regexp for *this* class, whereas getattr would also
  413         # match the superclass
  414         if '_VALID_URL_RE' not in cls.__dict__:
  415             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
  416         return cls._VALID_URL_RE.match(url) is not None
  417 
  418     @classmethod
  419     def _match_id(cls, url):
  420         if '_VALID_URL_RE' not in cls.__dict__:
  421             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
  422         m = cls._VALID_URL_RE.match(url)
  423         assert m
  424         return compat_str(m.group('id'))
  425 
  426     @classmethod
  427     def working(cls):
  428         """Getter method for _WORKING."""
  429         return cls._WORKING
  430 
  431     def initialize(self):
  432         """Initializes an instance (authentication, etc)."""
  433         self._initialize_geo_bypass({
  434             'countries': self._GEO_COUNTRIES,
  435             'ip_blocks': self._GEO_IP_BLOCKS,
  436         })
  437         if not self._ready:
  438             self._real_initialize()
  439             self._ready = True
  440 
  441     def _initialize_geo_bypass(self, geo_bypass_context):
  442         """
  443         Initialize geo restriction bypass mechanism.
  444 
  445         This method is used to initialize geo bypass mechanism based on faking
  446         X-Forwarded-For HTTP header. A random country from provided country list
  447         is selected and a random IP belonging to this country is generated. This
  448         IP will be passed as X-Forwarded-For HTTP header in all subsequent
  449         HTTP requests.
  450 
  451         This method will be used for initial geo bypass mechanism initialization
  452         during the instance initialization with _GEO_COUNTRIES and
  453         _GEO_IP_BLOCKS.
  454 
  455         You may also manually call it from extractor's code if geo bypass
  456         information is not available beforehand (e.g. obtained during
  457         extraction) or due to some other reason. In this case you should pass
  458         this information in geo bypass context passed as first argument. It may
  459         contain following fields:
  460 
  461         countries:  List of geo unrestricted countries (similar
  462                     to _GEO_COUNTRIES)
  463         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
  464                     (similar to _GEO_IP_BLOCKS)
  465 
  466         """
  467         if not self._x_forwarded_for_ip:
  468 
  469             # Geo bypass mechanism is explicitly disabled by user
  470             if not self._downloader.params.get('geo_bypass', True):
  471                 return
  472 
  473             if not geo_bypass_context:
  474                 geo_bypass_context = {}
  475 
  476             # Backward compatibility: previously _initialize_geo_bypass
  477             # expected a list of countries, some 3rd party code may still use
  478             # it this way
  479             if isinstance(geo_bypass_context, (list, tuple)):
  480                 geo_bypass_context = {
  481                     'countries': geo_bypass_context,
  482                 }
  483 
  484             # The whole point of geo bypass mechanism is to fake IP
  485             # as X-Forwarded-For HTTP header based on some IP block or
  486             # country code.
  487 
  488             # Path 1: bypassing based on IP block in CIDR notation
  489 
  490             # Explicit IP block specified by user, use it right away
  491             # regardless of whether extractor is geo bypassable or not
  492             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
  493 
  494             # Otherwise use random IP block from geo bypass context but only
  495             # if extractor is known as geo bypassable
  496             if not ip_block:
  497                 ip_blocks = geo_bypass_context.get('ip_blocks')
  498                 if self._GEO_BYPASS and ip_blocks:
  499                     ip_block = random.choice(ip_blocks)
  500 
  501             if ip_block:
  502                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
  503                 if self._downloader.params.get('verbose', False):
  504                     self._downloader.to_screen(
  505                         '[debug] Using fake IP %s as X-Forwarded-For.'
  506                         % self._x_forwarded_for_ip)
  507                 return
  508 
  509             # Path 2: bypassing based on country code
  510 
  511             # Explicit country code specified by user, use it right away
  512             # regardless of whether extractor is geo bypassable or not
  513             country = self._downloader.params.get('geo_bypass_country', None)
  514 
  515             # Otherwise use random country code from geo bypass context but
  516             # only if extractor is known as geo bypassable
  517             if not country:
  518                 countries = geo_bypass_context.get('countries')
  519                 if self._GEO_BYPASS and countries:
  520                     country = random.choice(countries)
  521 
  522             if country:
  523                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
  524                 if self._downloader.params.get('verbose', False):
  525                     self._downloader.to_screen(
  526                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
  527                         % (self._x_forwarded_for_ip, country.upper()))
  528 
  529     def extract(self, url):
  530         """Extracts URL information and returns it in list of dicts."""
  531         try:
  532             for _ in range(2):
  533                 try:
  534                     self.initialize()
  535                     ie_result = self._real_extract(url)
  536                     if self._x_forwarded_for_ip:
  537                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
  538                     return ie_result
  539                 except GeoRestrictedError as e:
  540                     if self.__maybe_fake_ip_and_retry(e.countries):
  541                         continue
  542                     raise
  543         except ExtractorError:
  544             raise
  545         except compat_http_client.IncompleteRead as e:
  546             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
  547         except (KeyError, StopIteration) as e:
  548             raise ExtractorError('An extractor error has occurred.', cause=e)
  549 
  550     def __maybe_fake_ip_and_retry(self, countries):
  551         if (not self._downloader.params.get('geo_bypass_country', None)
  552                 and self._GEO_BYPASS
  553                 and self._downloader.params.get('geo_bypass', True)
  554                 and not self._x_forwarded_for_ip
  555                 and countries):
  556             country_code = random.choice(countries)
  557             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
  558             if self._x_forwarded_for_ip:
  559                 self.report_warning(
  560                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
  561                     % (self._x_forwarded_for_ip, country_code.upper()))
  562                 return True
  563         return False
  564 
  565     def set_downloader(self, downloader):
  566         """Sets the downloader for this IE."""
  567         self._downloader = downloader
  568 
  569     def _real_initialize(self):
  570         """Real initialization process. Redefine in subclasses."""
  571         pass
  572 
  573     def _real_extract(self, url):
  574         """Real extraction process. Redefine in subclasses."""
  575         pass
  576 
  577     @classmethod
  578     def ie_key(cls):
  579         """A string for getting the InfoExtractor with get_info_extractor"""
  580         return compat_str(cls.__name__[:-2])
  581 
  582     @property
  583     def IE_NAME(self):
  584         return compat_str(type(self).__name__[:-2])
  585 
  586     @staticmethod
  587     def __can_accept_status_code(err, expected_status):
  588         assert isinstance(err, compat_urllib_error.HTTPError)
  589         if expected_status is None:
  590             return False
  591         if isinstance(expected_status, compat_integer_types):
  592             return err.code == expected_status
  593         elif isinstance(expected_status, (list, tuple)):
  594             return err.code in expected_status
  595         elif callable(expected_status):
  596             return expected_status(err.code) is True
  597         else:
  598             assert False
  599 
  600     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
  601         """
  602         Return the response handle.
  603 
  604         See _download_webpage docstring for arguments specification.
  605         """
  606         if note is None:
  607             self.report_download_webpage(video_id)
  608         elif note is not False:
  609             if video_id is None:
  610                 self.to_screen('%s' % (note,))
  611             else:
  612                 self.to_screen('%s: %s' % (video_id, note))
  613 
  614         # Some sites check X-Forwarded-For HTTP header in order to figure out
  615         # the origin of the client behind proxy. This allows bypassing geo
  616         # restriction by faking this header's value to IP that belongs to some
  617         # geo unrestricted country. We will do so once we encounter any
  618         # geo restriction error.
  619         if self._x_forwarded_for_ip:
  620             if 'X-Forwarded-For' not in headers:
  621                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
  622 
  623         if isinstance(url_or_request, compat_urllib_request.Request):
  624             url_or_request = update_Request(
  625                 url_or_request, data=data, headers=headers, query=query)
  626         else:
  627             if query:
  628                 url_or_request = update_url_query(url_or_request, query)
  629             if data is not None or headers:
  630                 url_or_request = sanitized_Request(url_or_request, data, headers)
  631         exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
  632         if hasattr(ssl, 'CertificateError'):
  633             exceptions.append(ssl.CertificateError)
  634         try:
  635             return self._downloader.urlopen(url_or_request)
  636         except tuple(exceptions) as err:
  637             if isinstance(err, compat_urllib_error.HTTPError):
  638                 if self.__can_accept_status_code(err, expected_status):
  639                     # Retain reference to error to prevent file object from
  640                     # being closed before it can be read. Works around the
  641                     # effects of <https://bugs.python.org/issue15002>
  642                     # introduced in Python 3.4.1.
  643                     err.fp._error = err
  644                     return err.fp
  645 
  646             if errnote is False:
  647                 return False
  648             if errnote is None:
  649                 errnote = 'Unable to download webpage'
  650 
  651             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
  652             if fatal:
  653                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
  654             else:
  655                 self._downloader.report_warning(errmsg)
  656                 return False
  657 
  658     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
  659         """
  660         Return a tuple (page content as string, URL handle).
  661 
  662         See _download_webpage docstring for arguments specification.
  663         """
  664         # Strip hashes from the URL (#1038)
  665         if isinstance(url_or_request, (compat_str, str)):
  666             url_or_request = url_or_request.partition('#')[0]
  667 
  668         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
  669         if urlh is False:
  670             assert not fatal
  671             return False
  672         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
  673         return (content, urlh)
  674 
  675     @staticmethod
  676     def _guess_encoding_from_content(content_type, webpage_bytes):
  677         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
  678         if m:
  679             encoding = m.group(1)
  680         else:
  681             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
  682                           webpage_bytes[:1024])
  683             if m:
  684                 encoding = m.group(1).decode('ascii')
  685             elif webpage_bytes.startswith(b'\xff\xfe'):
  686                 encoding = 'utf-16'
  687             else:
  688                 encoding = 'utf-8'
  689 
  690         return encoding
  691 
    def __check_blocked(self, content):
        """Raise ExtractorError (expected=True) if `content` looks like a known
        censorship/filtering block page rather than the requested webpage."""
        first_block = content[:512]
        # Websense corporate filtering: title anywhere, vendor name near the top
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian federal blocklist page; the Russian title reads
        # "TTK :: Access to the resource is restricted"
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
  719 
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the response body from `urlh` and decode it to a string.

        Optionally prepends `prefix` bytes, guesses the encoding when none is
        given, and honors the dump_intermediate_pages / write_pages downloader
        options. Raises via __check_blocked if the page is a known block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 so binary or badly-encoded pages survive the terminal
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the filename short enough for common filesystem limits
                # while staying unique via an md5 of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name declared by the page; fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
  756 
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries on IncompleteRead errors
        timeout -- sleep interval (seconds) between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        # Retry only on IncompleteRead; all other errors propagate through
        # _download_webpage_handle's fatal/errnote handling.
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            # non-fatal failure reported by _download_webpage_handle
            return res
        else:
            content, _ = res
            return content
  814 
  815     def _download_xml_handle(
  816             self, url_or_request, video_id, note='Downloading XML',
  817             errnote='Unable to download XML', transform_source=None,
  818             fatal=True, encoding=None, data=None, headers={}, query={},
  819             expected_status=None):
  820         """
  821         Return a tuple (xml as an compat_etree_Element, URL handle).
  822 
  823         See _download_webpage docstring for arguments specification.
  824         """
  825         res = self._download_webpage_handle(
  826             url_or_request, video_id, note, errnote, fatal=fatal,
  827             encoding=encoding, data=data, headers=headers, query=query,
  828             expected_status=expected_status)
  829         if res is False:
  830             return res
  831         xml_string, urlh = res
  832         return self._parse_xml(
  833             xml_string, video_id, transform_source=transform_source,
  834             fatal=fatal), urlh
  835 
  836     def _download_xml(
  837             self, url_or_request, video_id,
  838             note='Downloading XML', errnote='Unable to download XML',
  839             transform_source=None, fatal=True, encoding=None,
  840             data=None, headers={}, query={}, expected_status=None):
  841         """
  842         Return the xml as an compat_etree_Element.
  843 
  844         See _download_webpage docstring for arguments specification.
  845         """
  846         res = self._download_xml_handle(
  847             url_or_request, video_id, note=note, errnote=errnote,
  848             transform_source=transform_source, fatal=fatal, encoding=encoding,
  849             data=data, headers=headers, query=query,
  850             expected_status=expected_status)
  851         return res if res is False else res[0]
  852 
  853     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
  854         if transform_source:
  855             xml_string = transform_source(xml_string)
  856         try:
  857             return compat_etree_fromstring(xml_string.encode('utf-8'))
  858         except compat_xml_parse_error as ve:
  859             errmsg = '%s: Failed to parse XML ' % video_id
  860             if fatal:
  861                 raise ExtractorError(errmsg, cause=ve)
  862             else:
  863                 self.report_warning(errmsg + str(ve))
  864 
  865     def _download_json_handle(
  866             self, url_or_request, video_id, note='Downloading JSON metadata',
  867             errnote='Unable to download JSON metadata', transform_source=None,
  868             fatal=True, encoding=None, data=None, headers={}, query={},
  869             expected_status=None):
  870         """
  871         Return a tuple (JSON object, URL handle).
  872 
  873         See _download_webpage docstring for arguments specification.
  874         """
  875         res = self._download_webpage_handle(
  876             url_or_request, video_id, note, errnote, fatal=fatal,
  877             encoding=encoding, data=data, headers=headers, query=query,
  878             expected_status=expected_status)
  879         if res is False:
  880             return res
  881         json_string, urlh = res
  882         return self._parse_json(
  883             json_string, video_id, transform_source=transform_source,
  884             fatal=fatal), urlh
  885 
  886     def _download_json(
  887             self, url_or_request, video_id, note='Downloading JSON metadata',
  888             errnote='Unable to download JSON metadata', transform_source=None,
  889             fatal=True, encoding=None, data=None, headers={}, query={},
  890             expected_status=None):
  891         """
  892         Return the JSON object as a dict.
  893 
  894         See _download_webpage docstring for arguments specification.
  895         """
  896         res = self._download_json_handle(
  897             url_or_request, video_id, note=note, errnote=errnote,
  898             transform_source=transform_source, fatal=fatal, encoding=encoding,
  899             data=data, headers=headers, query=query,
  900             expected_status=expected_status)
  901         return res if res is False else res[0]
  902 
  903     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
  904         if transform_source:
  905             json_string = transform_source(json_string)
  906         try:
  907             return json.loads(json_string)
  908         except ValueError as ve:
  909             errmsg = '%s: Failed to parse JSON ' % video_id
  910             if fatal:
  911                 raise ExtractorError(errmsg, cause=ve)
  912             else:
  913                 self.report_warning(errmsg + str(ve))
  914 
  915     def report_warning(self, msg, video_id=None):
  916         idstr = '' if video_id is None else '%s: ' % video_id
  917         self._downloader.report_warning(
  918             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
  919 
  920     def to_screen(self, msg):
  921         """Print msg to screen, prefixing it with '[ie_name]'"""
  922         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
  923 
  924     def report_extraction(self, id_or_name):
  925         """Report information extraction."""
  926         self.to_screen('%s: Extracting information' % id_or_name)
  927 
  928     def report_download_webpage(self, video_id):
  929         """Report webpage download."""
  930         self.to_screen('%s: Downloading webpage' % video_id)
  931 
  932     def report_age_confirmation(self):
  933         """Report attempt to confirm age."""
  934         self.to_screen('Confirming age')
  935 
  936     def report_login(self):
  937         """Report attempt to log in."""
  938         self.to_screen('Logging in')
  939 
  940     @staticmethod
  941     def raise_login_required(msg='This video is only available for registered users'):
  942         raise ExtractorError(
  943             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
  944             expected=True)
  945 
  946     @staticmethod
  947     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
  948         raise GeoRestrictedError(msg, countries=countries)
  949 
  950     # Methods for following #608
  951     @staticmethod
  952     def url_result(url, ie=None, video_id=None, video_title=None):
  953         """Returns a URL that points to a page that should be processed"""
  954         # TODO: ie should be the class used for getting the info
  955         video_info = {'_type': 'url',
  956                       'url': url,
  957                       'ie_key': ie}
  958         if video_id is not None:
  959             video_info['id'] = video_id
  960         if video_title is not None:
  961             video_info['title'] = video_title
  962         return video_info
  963 
  964     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
  965         urls = orderedSet(
  966             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
  967             for m in matches)
  968         return self.playlist_result(
  969             urls, playlist_id=playlist_id, playlist_title=playlist_title)
  970 
  971     @staticmethod
  972     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
  973         """Returns a playlist"""
  974         video_info = {'_type': 'playlist',
  975                       'entries': entries}
  976         if playlist_id:
  977             video_info['id'] = playlist_id
  978         if playlist_title:
  979             video_info['title'] = playlist_title
  980         if playlist_description:
  981             video_info['description'] = playlist_description
  982         return video_info
  983 
  984     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
  985         """
  986         Perform a regex search on the given string, using a single or a list of
  987         patterns returning the first matching group.
  988         In case of failure return a default value or raise a WARNING or a
  989         RegexNotFoundError, depending on fatal, specifying the field name.
  990         """
  991         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
  992             mobj = re.search(pattern, string, flags)
  993         else:
  994             for p in pattern:
  995                 mobj = re.search(p, string, flags)
  996                 if mobj:
  997                     break
  998 
  999         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 1000             _name = '\033[0;34m%s\033[0m' % name
 1001         else:
 1002             _name = name
 1003 
 1004         if mobj:
 1005             if group is None:
 1006                 # return the first matching group
 1007                 return next(g for g in mobj.groups() if g is not None)
 1008             else:
 1009                 return mobj.group(group)
 1010         elif default is not NO_DEFAULT:
 1011             return default
 1012         elif fatal:
 1013             raise RegexNotFoundError('Unable to extract %s' % _name)
 1014         else:
 1015             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 1016             return None
 1017 
 1018     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 1019         """
 1020         Like _search_regex, but strips HTML tags and unescapes entities.
 1021         """
 1022         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 1023         if res:
 1024             return clean_html(res).strip()
 1025         else:
 1026             return res
 1027 
 1028     def _get_netrc_login_info(self, netrc_machine=None):
 1029         username = None
 1030         password = None
 1031         netrc_machine = netrc_machine or self._NETRC_MACHINE
 1032 
 1033         if self._downloader.params.get('usenetrc', False):
 1034             try:
 1035                 info = netrc.netrc().authenticators(netrc_machine)
 1036                 if info is not None:
 1037                     username = info[0]
 1038                     password = info[2]
 1039                 else:
 1040                     raise netrc.NetrcParseError(
 1041                         'No authenticators for %s' % netrc_machine)
 1042             except (IOError, netrc.NetrcParseError) as err:
 1043                 self._downloader.report_warning(
 1044                     'parsing .netrc: %s' % error_to_compat_str(err))
 1045 
 1046         return username, password
 1047 
 1048     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 1049         """
 1050         Get the login info as (username, password)
 1051         First look for the manually specified credentials using username_option
 1052         and password_option as keys in params dictionary. If no such credentials
 1053         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 1054         value.
 1055         If there's no info available, return (None, None)
 1056         """
 1057         if self._downloader is None:
 1058             return (None, None)
 1059 
 1060         downloader_params = self._downloader.params
 1061 
 1062         # Attempt to use provided username and password or .netrc data
 1063         if downloader_params.get(username_option) is not None:
 1064             username = downloader_params[username_option]
 1065             password = downloader_params[password_option]
 1066         else:
 1067             username, password = self._get_netrc_login_info(netrc_machine)
 1068 
 1069         return username, password
 1070 
 1071     def _get_tfa_info(self, note='two-factor verification code'):
 1072         """
 1073         Get the two-factor authentication info
 1074         TODO - asking the user will be required for sms/phone verify
 1075         currently just uses the command line option
 1076         If there's no info available, return None
 1077         """
 1078         if self._downloader is None:
 1079             return None
 1080         downloader_params = self._downloader.params
 1081 
 1082         if downloader_params.get('twofactor') is not None:
 1083             return downloader_params['twofactor']
 1084 
 1085         return compat_getpass('Type %s and press [Return]: ' % note)
 1086 
 1087     # Helper functions for extracting OpenGraph info
 1088     @staticmethod
 1089     def _og_regexes(prop):
 1090         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 1091         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
 1092                        % {'prop': re.escape(prop)})
 1093         template = r'<meta[^>]+?%s[^>]+?%s'
 1094         return [
 1095             template % (property_re, content_re),
 1096             template % (content_re, property_re),
 1097         ]
 1098 
 1099     @staticmethod
 1100     def _meta_regex(prop):
 1101         return r'''(?isx)<meta
 1102                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 1103                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 1104 
 1105     def _og_search_property(self, prop, html, name=None, **kargs):
 1106         if not isinstance(prop, (list, tuple)):
 1107             prop = [prop]
 1108         if name is None:
 1109             name = 'OpenGraph %s' % prop[0]
 1110         og_regexes = []
 1111         for p in prop:
 1112             og_regexes.extend(self._og_regexes(p))
 1113         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 1114         if escaped is None:
 1115             return None
 1116         return unescapeHTML(escaped)
 1117 
 1118     def _og_search_thumbnail(self, html, **kargs):
 1119         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 1120 
 1121     def _og_search_description(self, html, **kargs):
 1122         return self._og_search_property('description', html, fatal=False, **kargs)
 1123 
 1124     def _og_search_title(self, html, **kargs):
 1125         return self._og_search_property('title', html, **kargs)
 1126 
 1127     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 1128         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 1129         if secure:
 1130             regexes = self._og_regexes('video:secure_url') + regexes
 1131         return self._html_search_regex(regexes, html, name, **kargs)
 1132 
 1133     def _og_search_url(self, html, **kargs):
 1134         return self._og_search_property('url', html, **kargs)
 1135 
 1136     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 1137         if not isinstance(name, (list, tuple)):
 1138             name = [name]
 1139         if display_name is None:
 1140             display_name = name[0]
 1141         return self._html_search_regex(
 1142             [self._meta_regex(n) for n in name],
 1143             html, display_name, fatal=fatal, group='content', **kwargs)
 1144 
 1145     def _dc_search_uploader(self, html):
 1146         return self._html_search_meta('dc.creator', html, 'uploader')
 1147 
 1148     def _rta_search(self, html):
 1149         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 1150         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 1151                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 1152                      html):
 1153             return 18
 1154         return 0
 1155 
 1156     def _media_rating_search(self, html):
 1157         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 1158         rating = self._html_search_meta('rating', html)
 1159 
 1160         if not rating:
 1161             return None
 1162 
 1163         RATING_TABLE = {
 1164             'safe for kids': 0,
 1165             'general': 8,
 1166             '14 years': 14,
 1167             'mature': 17,
 1168             'restricted': 19,
 1169         }
 1170         return RATING_TABLE.get(rating.lower())
 1171 
 1172     def _family_friendly_search(self, html):
 1173         # See http://schema.org/VideoObject
 1174         family_friendly = self._html_search_meta(
 1175             'isFamilyFriendly', html, default=None)
 1176 
 1177         if not family_friendly:
 1178             return None
 1179 
 1180         RATING_TABLE = {
 1181             '1': 0,
 1182             'true': 0,
 1183             '0': 18,
 1184             'false': 18,
 1185         }
 1186         return RATING_TABLE.get(family_friendly.lower())
 1187 
 1188     def _twitter_search_player(self, html):
 1189         return self._html_search_meta('twitter:player', html,
 1190                                       'twitter card player')
 1191 
    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        """Extract metadata from JSON-LD blocks found in html.

        Returns the info dict produced by _json_ld, the `default` kwarg when
        nothing usable was found, or raises RegexNotFoundError when fatal.
        """
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            # A script block may hold a single object or a list of objects;
            # flatten everything into one list of candidates
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
 1220 
 1221     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 1222         if isinstance(json_ld, compat_str):
 1223             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 1224         if not json_ld:
 1225             return {}
 1226         info = {}
 1227         if not isinstance(json_ld, (list, tuple, dict)):
 1228             return info
 1229         if isinstance(json_ld, dict):
 1230             json_ld = [json_ld]
 1231 
 1232         INTERACTION_TYPE_MAP = {
 1233             'CommentAction': 'comment',
 1234             'AgreeAction': 'like',
 1235             'DisagreeAction': 'dislike',
 1236             'LikeAction': 'like',
 1237             'DislikeAction': 'dislike',
 1238             'ListenAction': 'view',
 1239             'WatchAction': 'view',
 1240             'ViewAction': 'view',
 1241         }
 1242 
 1243         def extract_interaction_type(e):
 1244             interaction_type = e.get('interactionType')
 1245             if isinstance(interaction_type, dict):
 1246                 interaction_type = interaction_type.get('@type')
 1247             return str_or_none(interaction_type)
 1248 
 1249         def extract_interaction_statistic(e):
 1250             interaction_statistic = e.get('interactionStatistic')
 1251             if isinstance(interaction_statistic, dict):
 1252                 interaction_statistic = [interaction_statistic]
 1253             if not isinstance(interaction_statistic, list):
 1254                 return
 1255             for is_e in interaction_statistic:
 1256                 if not isinstance(is_e, dict):
 1257                     continue
 1258                 if is_e.get('@type') != 'InteractionCounter':
 1259                     continue
 1260                 interaction_type = extract_interaction_type(is_e)
 1261                 if not interaction_type:
 1262                     continue
 1263                 # For interaction count some sites provide string instead of
 1264                 # an integer (as per spec) with non digit characters (e.g. ",")
 1265                 # so extracting count with more relaxed str_to_int
 1266                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
 1267                 if interaction_count is None:
 1268                     continue
 1269                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
 1270                 if not count_kind:
 1271                     continue
 1272                 count_key = '%s_count' % count_kind
 1273                 if info.get(count_key) is not None:
 1274                     continue
 1275                 info[count_key] = interaction_count
 1276 
 1277         def extract_video_object(e):
 1278             assert e['@type'] == 'VideoObject'
 1279             author = e.get('author')
 1280             info.update({
 1281                 'url': url_or_none(e.get('contentUrl')),
 1282                 'title': unescapeHTML(e.get('name')),
 1283                 'description': unescapeHTML(e.get('description')),
 1284                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
 1285                 'duration': parse_duration(e.get('duration')),
 1286                 'timestamp': unified_timestamp(e.get('uploadDate')),
 1287                 # author can be an instance of 'Organization' or 'Person' types.
 1288                 # both types can have 'name' property(inherited from 'Thing' type). [1]
 1289                 # however some websites are using 'Text' type instead.
 1290                 # 1. https://schema.org/VideoObject
 1291                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
 1292                 'filesize': float_or_none(e.get('contentSize')),
 1293                 'tbr': int_or_none(e.get('bitrate')),
 1294                 'width': int_or_none(e.get('width')),
 1295                 'height': int_or_none(e.get('height')),
 1296                 'view_count': int_or_none(e.get('interactionCount')),
 1297             })
 1298             extract_interaction_statistic(e)
 1299 
 1300         for e in json_ld:
 1301             if '@context' in e:
 1302                 item_type = e.get('@type')
 1303                 if expected_type is not None and expected_type != item_type:
 1304                     continue
 1305                 if item_type in ('TVEpisode', 'Episode'):
 1306                     episode_name = unescapeHTML(e.get('name'))
 1307                     info.update({
 1308                         'episode': episode_name,
 1309                         'episode_number': int_or_none(e.get('episodeNumber')),
 1310                         'description': unescapeHTML(e.get('description')),
 1311                     })
 1312                     if not info.get('title') and episode_name:
 1313                         info['title'] = episode_name
 1314                     part_of_season = e.get('partOfSeason')
 1315                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
 1316                         info.update({
 1317                             'season': unescapeHTML(part_of_season.get('name')),
 1318                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
 1319                         })
 1320                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
 1321                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
 1322                         info['series'] = unescapeHTML(part_of_series.get('name'))
 1323                 elif item_type == 'Movie':
 1324                     info.update({
 1325                         'title': unescapeHTML(e.get('name')),
 1326                         'description': unescapeHTML(e.get('description')),
 1327                         'duration': parse_duration(e.get('duration')),
 1328                         'timestamp': unified_timestamp(e.get('dateCreated')),
 1329                     })
 1330                 elif item_type in ('Article', 'NewsArticle'):
 1331                     info.update({
 1332                         'timestamp': parse_iso8601(e.get('datePublished')),
 1333                         'title': unescapeHTML(e.get('headline')),
 1334                         'description': unescapeHTML(e.get('articleBody')),
 1335                     })
 1336                 elif item_type == 'VideoObject':
 1337                     extract_video_object(e)
 1338                     if expected_type is None:
 1339                         continue
 1340                     else:
 1341                         break
 1342                 video = e.get('video')
 1343                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
 1344                     extract_video_object(video)
 1345                 if expected_type is None:
 1346                     continue
 1347                 else:
 1348                     break
 1349         return dict((k, v) for k, v in info.items() if v is not None)
 1350 
 1351     @staticmethod
 1352     def _hidden_inputs(html):
 1353         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 1354         hidden_inputs = {}
 1355         for input in re.findall(r'(?i)(<input[^>]+>)', html):
 1356             attrs = extract_attributes(input)
 1357             if not input:
 1358                 continue
 1359             if attrs.get('type') not in ('hidden', 'submit'):
 1360                 continue
 1361             name = attrs.get('name') or attrs.get('id')
 1362             value = attrs.get('value')
 1363             if name and value is not None:
 1364                 hidden_inputs[name] = value
 1365         return hidden_inputs
 1366 
 1367     def _form_hidden_inputs(self, form_id, html):
 1368         form = self._search_regex(
 1369             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 1370             html, '%s form' % form_id, group='form')
 1371         return self._hidden_inputs(form)
 1372 
    def _sort_formats(self, formats, field_preference=None):
        """Sort `formats` in place from worst to best quality.

        When `field_preference` is a list/tuple of field names, formats are
        ordered solely by those fields (missing values sort lowest).
        Otherwise a heuristic key is used, built from preference, codecs,
        bitrate, resolution, protocol and extension.

        Raises ExtractorError if `formats` is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            # Derive a missing extension from the URL so the ext-based
            # preferences below still apply.
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified ordering: a missing value sorts lowest
                # ('' for format_id since it compares against strings,
                # -1 for numeric fields).
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            # Plain HTTP(S) downloads are preferred over streaming protocols.
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    # Unknown audio extension sorts below all known ones.
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare lexicographically: earlier entries dominate.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
 1448 
 1449     def _check_formats(self, formats, video_id):
 1450         if formats:
 1451             formats[:] = filter(
 1452                 lambda f: self._is_valid_url(
 1453                     f['url'], video_id,
 1454                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 1455                 formats)
 1456 
 1457     @staticmethod
 1458     def _remove_duplicate_formats(formats):
 1459         format_urls = set()
 1460         unique_formats = []
 1461         for f in formats:
 1462             if f['url'] not in format_urls:
 1463                 format_urls.add(f['url'])
 1464                 unique_formats.append(f)
 1465         formats[:] = unique_formats
 1466 
 1467     def _is_valid_url(self, url, video_id, item='video', headers={}):
 1468         url = self._proto_relative_url(url, scheme='http:')
 1469         # For now assume non HTTP(S) URLs always valid
 1470         if not (url.startswith('http://') or url.startswith('https://')):
 1471             return True
 1472         try:
 1473             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
 1474             return True
 1475         except ExtractorError as e:
 1476             self.to_screen(
 1477                 '%s: %s URL is invalid, skipping: %s'
 1478                 % (video_id, item, error_to_compat_str(e.cause)))
 1479             return False
 1480 
 1481     def http_scheme(self):
 1482         """ Either "http:" or "https:", depending on the user's preferences """
 1483         return (
 1484             'http:'
 1485             if self._downloader.params.get('prefer_insecure', False)
 1486             else 'https:')
 1487 
 1488     def _proto_relative_url(self, url, scheme=None):
 1489         if url is None:
 1490             return url
 1491         if url.startswith('//'):
 1492             if scheme is None:
 1493                 scheme = self.http_scheme()
 1494             return scheme + url
 1495         else:
 1496             return url
 1497 
 1498     def _sleep(self, timeout, video_id, msg_template=None):
 1499         if msg_template is None:
 1500             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 1501         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 1502         self.to_screen(msg)
 1503         time.sleep(timeout)
 1504 
 1505     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 1506                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 1507                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
 1508         manifest = self._download_xml(
 1509             manifest_url, video_id, 'Downloading f4m manifest',
 1510             'Unable to download f4m manifest',
 1511             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 1512             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
 1513             transform_source=transform_source,
 1514             fatal=fatal, data=data, headers=headers, query=query)
 1515 
 1516         if manifest is False:
 1517             return []
 1518 
 1519         return self._parse_f4m_formats(
 1520             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
 1521             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
 1522 
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an Adobe f4m manifest XML tree into a list of format dicts.

        `manifest` is the parsed XML element; `manifest_url` is the URL it
        was fetched from (used to resolve relative media URLs).  Nested
        f4m/m3u8 manifests referenced by set-level manifests are fetched
        and parsed recursively.  Returns [] for DRM-protected or otherwise
        unusable manifests.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist.
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio/* mimeType marks the whole manifest as audio-only.
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Use the bitrate for the format id when known, else the index.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
 1623 
 1624     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
 1625         return {
 1626             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
 1627             'url': m3u8_url,
 1628             'ext': ext,
 1629             'protocol': 'm3u8',
 1630             'preference': preference - 100 if preference else -100,
 1631             'resolution': 'multiple',
 1632             'format_note': 'Quality selection URL',
 1633         }
 1634 
 1635     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 1636                               entry_protocol='m3u8', preference=None,
 1637                               m3u8_id=None, note=None, errnote=None,
 1638                               fatal=True, live=False, data=None, headers={},
 1639                               query={}):
 1640         res = self._download_webpage_handle(
 1641             m3u8_url, video_id,
 1642             note=note or 'Downloading m3u8 information',
 1643             errnote=errnote or 'Failed to download m3u8 information',
 1644             fatal=fatal, data=data, headers=headers, query=query)
 1645 
 1646         if res is False:
 1647             return []
 1648 
 1649         m3u8_doc, urlh = res
 1650         m3u8_url = urlh.geturl()
 1651 
 1652         return self._parse_m3u8_formats(
 1653             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
 1654             preference=preference, m3u8_id=m3u8_id, live=live)
 1655 
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an m3u8 playlist document into a list of format dicts.

        `m3u8_doc` is the playlist text; `m3u8_url` its (post-redirect) URL,
        used to resolve relative URIs.  Master playlists yield one format
        per variant stream plus separate audio/video renditions; media
        playlists are returned as a single format.  DRM-protected
        playlists yield [].
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve a possibly-relative URI against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # group_id -> list of EXT-X-MEDIA attribute dicts in that group.
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag seen.
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Parse one EXT-X-MEDIA line: record it in its rendition group
            # and emit a format for audio/video renditions with a URI.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        # Second pass: each non-comment, non-blank line is a variant stream
        # URI described by the preceding EXT-X-STREAM-INF tag.
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

                last_stream_inf = {}
        return formats
 1833 
 1834     @staticmethod
 1835     def _xpath_ns(path, namespace=None):
 1836         if not namespace:
 1837             return path
 1838         out = []
 1839         for c in path.split('/'):
 1840             if not c or c == '.':
 1841                 out.append(c)
 1842             else:
 1843                 out.append('{%s}%s' % (namespace, c))
 1844         return '/'.join(out)
 1845 
 1846     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
 1847         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
 1848 
 1849         if smil is False:
 1850             assert not fatal
 1851             return []
 1852 
 1853         namespace = self._parse_smil_namespace(smil)
 1854 
 1855         return self._parse_smil_formats(
 1856             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
 1857 
 1858     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
 1859         smil = self._download_smil(smil_url, video_id, fatal=fatal)
 1860         if smil is False:
 1861             return {}
 1862         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
 1863 
 1864     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
 1865         return self._download_xml(
 1866             smil_url, video_id, 'Downloading SMIL file',
 1867             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
 1868 
 1869     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
 1870         namespace = self._parse_smil_namespace(smil)
 1871 
 1872         formats = self._parse_smil_formats(
 1873             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
 1874         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
 1875 
 1876         video_id = os.path.splitext(url_basename(smil_url))[0]
 1877         title = None
 1878         description = None
 1879         upload_date = None
 1880         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
 1881             name = meta.attrib.get('name')
 1882             content = meta.attrib.get('content')
 1883             if not name or not content:
 1884                 continue
 1885             if not title and name == 'title':
 1886                 title = content
 1887             elif not description and name in ('description', 'abstract'):
 1888                 description = content
 1889             elif not upload_date and name == 'date':
 1890                 upload_date = unified_strdate(content)
 1891 
 1892         thumbnails = [{
 1893             'id': image.get('type'),
 1894             'url': image.get('src'),
 1895             'width': int_or_none(image.get('width')),
 1896             'height': int_or_none(image.get('height')),
 1897         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
 1898 
 1899         return {
 1900             'id': video_id,
 1901             'title': title or video_id,
 1902             'description': description,
 1903             'upload_date': upload_date,
 1904             'thumbnails': thumbnails,
 1905             'formats': formats,
 1906             'subtitles': subtitles,
 1907         }
 1908 
 1909     def _parse_smil_namespace(self, smil):
 1910         return self._search_regex(
 1911             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
 1912 
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract media formats from a parsed SMIL document.

        Recognizes RTMP streams as well as HLS (m3u8), HDS (f4m),
        DASH (mpd), ISM and plain HTTP references found in the
        document's <video> and <audio> elements.
        """
        # A <meta base="..."> (or httpBase) in the head overrides the
        # manifest URL as the base for resolving relative media URLs
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Process each unique media src only once
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # system-bitrate is in bit/s; float_or_none scales it to kbit/s
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            # RTMP: streamer is the connection URL and src the play path
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                # Callers may rewrite RTMP URLs (e.g. for CDN-specific auth)
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            # Resolve relative URLs against the detected base
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry playlist carries no variant metadata of its
                # own, so propagate the SMIL-level attributes onto it
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default hdcore/plugin query parameters expected by
                    # Adobe HDS servers
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                # NOTE(review): the validity probe above uses the original
                # (possibly relative) src rather than the resolved src_url —
                # confirm this is intended
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
 2007 
 2008     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
 2009         urls = []
 2010         subtitles = {}
 2011         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
 2012             src = textstream.get('src')
 2013             if not src or src in urls:
 2014                 continue
 2015             urls.append(src)
 2016             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
 2017             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
 2018             subtitles.setdefault(lang, []).append({
 2019                 'url': src,
 2020                 'ext': ext,
 2021             })
 2022         return subtitles
 2023 
 2024     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
 2025         xspf = self._download_xml(
 2026             xspf_url, playlist_id, 'Downloading xpsf playlist',
 2027             'Unable to download xspf manifest', fatal=fatal)
 2028         if xspf is False:
 2029             return []
 2030         return self._parse_xspf(
 2031             xspf, playlist_id, xspf_url=xspf_url,
 2032             xspf_base_url=base_url(xspf_url))
 2033 
 2034     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
 2035         NS_MAP = {
 2036             'xspf': 'http://xspf.org/ns/0/',
 2037             's1': 'http://static.streamone.nl/player/ns/0',
 2038         }
 2039 
 2040         entries = []
 2041         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
 2042             title = xpath_text(
 2043                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
 2044             description = xpath_text(
 2045                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
 2046             thumbnail = xpath_text(
 2047                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
 2048             duration = float_or_none(
 2049                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
 2050 
 2051             formats = []
 2052             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
 2053                 format_url = urljoin(xspf_base_url, location.text)
 2054                 if not format_url:
 2055                     continue
 2056                 formats.append({
 2057                     'url': format_url,
 2058                     'manifest_url': xspf_url,
 2059                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
 2060                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
 2061                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
 2062                 })
 2063             self._sort_formats(formats)
 2064 
 2065             entries.append({
 2066                 'id': playlist_id,
 2067                 'title': title,
 2068                 'description': description,
 2069                 'thumbnail': thumbnail,
 2070                 'duration': duration,
 2071                 'formats': formats,
 2072             })
 2073         return entries
 2074 
 2075     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 2076         res = self._download_xml_handle(
 2077             mpd_url, video_id,
 2078             note=note or 'Downloading MPD manifest',
 2079             errnote=errnote or 'Failed to download MPD manifest',
 2080             fatal=fatal, data=data, headers=headers, query=query)
 2081         if res is False:
 2082             return []
 2083         mpd_doc, urlh = res
 2084         if mpd_doc is None:
 2085             return []
 2086         mpd_base_url = base_url(urlh.geturl())
 2087 
 2088         return self._parse_mpd_formats(
 2089             mpd_doc, mpd_id, mpd_base_url, mpd_url)
 2090 
 2091     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
 2092         """
 2093         Parse formats from MPD manifest.
 2094         References:
 2095          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
 2096             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
 2097          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
 2098         """
 2099         if mpd_doc.get('type') == 'dynamic':
 2100             return []
 2101 
 2102         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
 2103 
 2104         def _add_ns(path):
 2105             return self._xpath_ns(path, namespace)
 2106 
 2107         def is_drm_protected(element):
 2108             return element.find(_add_ns('ContentProtection')) is not None
 2109 
 2110         def extract_multisegment_info(element, ms_parent_info):
 2111             ms_info = ms_parent_info.copy()
 2112 
 2113             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
 2114             # common attributes and elements.  We will only extract relevant
 2115             # for us.
 2116             def extract_common(source):
 2117                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
 2118                 if segment_timeline is not None:
 2119                     s_e = segment_timeline.findall(_add_ns('S'))
 2120                     if s_e:
 2121                         ms_info['total_number'] = 0
 2122                         ms_info['s'] = []
 2123                         for s in s_e:
 2124                             r = int(s.get('r', 0))
 2125                             ms_info['total_number'] += 1 + r
 2126                             ms_info['s'].append({
 2127                                 't': int(s.get('t', 0)),
 2128                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
 2129                                 'd': int(s.attrib['d']),
 2130                                 'r': r,
 2131                             })
 2132                 start_number = source.get('startNumber')
 2133                 if start_number:
 2134                     ms_info['start_number'] = int(start_number)
 2135                 timescale = source.get('timescale')
 2136                 if timescale:
 2137                     ms_info['timescale'] = int(timescale)
 2138                 segment_duration = source.get('duration')
 2139                 if segment_duration:
 2140                     ms_info['segment_duration'] = float(segment_duration)
 2141 
 2142             def extract_Initialization(source):
 2143                 initialization = source.find(_add_ns('Initialization'))
 2144                 if initialization is not None:
 2145                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
 2146 
 2147             segment_list = element.find(_add_ns('SegmentList'))
 2148             if segment_list is not None:
 2149                 extract_common(segment_list)
 2150                 extract_Initialization(segment_list)
 2151                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
 2152                 if segment_urls_e:
 2153                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
 2154             else:
 2155                 segment_template = element.find(_add_ns('SegmentTemplate'))
 2156                 if segment_template is not None:
 2157                     extract_common(segment_template)
 2158                     media = segment_template.get('media')
 2159                     if media:
 2160                         ms_info['media'] = media
 2161                     initialization = segment_template.get('initialization')
 2162                     if initialization:
 2163                         ms_info['initialization'] = initialization
 2164                     else:
 2165                         extract_Initialization(segment_template)
 2166             return ms_info
 2167 
 2168         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
 2169         formats = []
 2170         for period in mpd_doc.findall(_add_ns('Period')):
 2171             period_duration = parse_duration(period.get('duration')) or mpd_duration
 2172             period_ms_info = extract_multisegment_info(period, {
 2173                 'start_number': 1,
 2174                 'timescale': 1,
 2175             })
 2176             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
 2177                 if is_drm_protected(adaptation_set):
 2178                     continue
 2179                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
 2180                 for representation in adaptation_set.findall(_add_ns('Representation')):
 2181                     if is_drm_protected(representation):
 2182                         continue
 2183                     representation_attrib = adaptation_set.attrib.copy()
 2184                     representation_attrib.update(representation.attrib)
 2185                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
 2186                     mime_type = representation_attrib['mimeType']
 2187                     content_type = mime_type.split('/')[0]
 2188                     if content_type == 'text':
 2189                         # TODO implement WebVTT downloading
 2190                         pass
 2191                     elif content_type in ('video', 'audio'):
 2192                         base_url = ''
 2193                         for element in (representation, adaptation_set, period, mpd_doc):
 2194                             base_url_e = element.find(_add_ns('BaseURL'))
 2195                             if base_url_e is not None:
 2196                                 base_url = base_url_e.text + base_url
 2197                                 if re.match(r'^https?://', base_url):
 2198                                     break
 2199                         if mpd_base_url and not re.match(r'^https?://', base_url):
 2200                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
 2201                                 mpd_base_url += '/'
 2202                             base_url = mpd_base_url + base_url
 2203                         representation_id = representation_attrib.get('id')
 2204                         lang = representation_attrib.get('lang')
 2205                         url_el = representation.find(_add_ns('BaseURL'))
 2206                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
 2207                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
 2208                         f = {
 2209                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
 2210                             'manifest_url': mpd_url,
 2211                             'ext': mimetype2ext(mime_type),
 2212                             'width': int_or_none(representation_attrib.get('width')),
 2213                             'height': int_or_none(representation_attrib.get('height')),
 2214                             'tbr': float_or_none(bandwidth, 1000),
 2215                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
 2216                             'fps': int_or_none(representation_attrib.get('frameRate')),
 2217                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
 2218                             'format_note': 'DASH %s' % content_type,
 2219                             'filesize': filesize,
 2220                             'container': mimetype2ext(mime_type) + '_dash',
 2221                         }
 2222                         f.update(parse_codecs(representation_attrib.get('codecs')))
 2223                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
 2224 
 2225                         def prepare_template(template_name, identifiers):
 2226                             tmpl = representation_ms_info[template_name]
 2227                             # First of, % characters outside $...$ templates
 2228                             # must be escaped by doubling for proper processing
 2229                             # by % operator string formatting used further (see
 2230                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
 2231                             t = ''
 2232                             in_template = False
 2233                             for c in tmpl:
 2234                                 t += c
 2235                                 if c == '$':
 2236                                     in_template = not in_template
 2237                                 elif c == '%' and not in_template:
 2238                                     t += c
 2239                             # Next, $...$ templates are translated to their
 2240                             # %(...) counterparts to be used with % operator
 2241                             t = t.replace('$RepresentationID$', representation_id)
 2242                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
 2243                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
 2244                             t.replace('$$', '$')
 2245                             return t
 2246 
 2247                         # @initialization is a regular template like @media one
 2248                         # so it should be handled just the same way (see
 2249                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
 2250                         if 'initialization' in representation_ms_info:
 2251                             initialization_template = prepare_template(
 2252                                 'initialization',
 2253                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
 2254                                 # $Time$ shall not be included for @initialization thus
 2255                                 # only $Bandwidth$ remains
 2256                                 ('Bandwidth', ))
 2257                             representation_ms_info['initialization_url'] = initialization_template % {
 2258                                 'Bandwidth': bandwidth,
 2259                             }
 2260 
 2261                         def location_key(location):
 2262                             return 'url' if re.match(r'^https?://', location) else 'path'
 2263 
 2264                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
 2265 
 2266                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
 2267                             media_location_key = location_key(media_template)
 2268 
 2269                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
 2270                             # can't be used at the same time
 2271                             if '%(Number' in media_template and 's' not in representation_ms_info:
 2272                                 segment_duration = None
 2273                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
 2274                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
 2275                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
 2276                                 representation_ms_info['fragments'] = [{
 2277                                     media_location_key: media_template % {
 2278                                         'Number': segment_number,
 2279                                         'Bandwidth': bandwidth,
 2280                                     },
 2281                                     'duration': segment_duration,
 2282                                 } for segment_number in range(
 2283                                     representation_ms_info['start_number'],
 2284                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
 2285                             else:
 2286                                 # $Number*$ or $Time$ in media template with S list available
 2287                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
 2288                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
 2289                                 representation_ms_info['fragments'] = []
 2290                                 segment_time = 0
 2291                                 segment_d = None
 2292                                 segment_number = representation_ms_info['start_number']
 2293 
 2294                                 def add_segment_url():
 2295                                     segment_url = media_template % {
 2296                                         'Time': segment_time,
 2297                                         'Bandwidth': bandwidth,
 2298                                         'Number': segment_number,
 2299                                     }
 2300                                     representation_ms_info['fragments'].append({
 2301                                         media_location_key: segment_url,
 2302                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
 2303                                     })
 2304 
 2305                                 for num, s in enumerate(representation_ms_info['s']):
 2306                                     segment_time = s.get('t') or segment_time
 2307                                     segment_d = s['d']
 2308                                     add_segment_url()
 2309                                     segment_number += 1
 2310                                     for r in range(s.get('r', 0)):
 2311                                         segment_time += segment_d
 2312                                         add_segment_url()
 2313                                         segment_number += 1
 2314                                     segment_time += segment_d
 2315                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
 2316                             # No media template
 2317                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
 2318                             # or any YouTube dashsegments video
 2319                             fragments = []
 2320                             segment_index = 0
 2321                             timescale = representation_ms_info['timescale']
 2322                             for s in representation_ms_info['s']:
 2323                                 duration = float_or_none(s['d'], timescale)
 2324                                 for r in range(s.get('r', 0) + 1):
 2325                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
 2326                                     fragments.append({
 2327                                         location_key(segment_uri): segment_uri,
 2328                                         'duration': duration,
 2329                                     })
 2330                                     segment_index += 1
 2331                             representation_ms_info['fragments'] = fragments
 2332                         elif 'segment_urls' in representation_ms_info:
 2333                             # Segment URLs with no SegmentTimeline
 2334                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
 2335                             # https://github.com/ytdl-org/youtube-dl/pull/14844
 2336                             fragments = []
 2337                             segment_duration = float_or_none(
 2338                                 representation_ms_info['segment_duration'],
 2339                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
 2340                             for segment_url in representation_ms_info['segment_urls']:
 2341                                 fragment = {
 2342                                     location_key(segment_url): segment_url,
 2343                                 }
 2344                                 if segment_duration:
 2345                                     fragment['duration'] = segment_duration
 2346                                 fragments.append(fragment)
 2347                             representation_ms_info['fragments'] = fragments
 2348                         # If there is a fragments key available then we correctly recognized fragmented media.
 2349                         # Otherwise we will assume unfragmented media with direct access. Technically, such
 2350                         # assumption is not necessarily correct since we may simply have no support for
 2351                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
 2352                         if 'fragments' in representation_ms_info:
 2353                             f.update({
 2354                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
 2355                                 'url': mpd_url or base_url,
 2356                                 'fragment_base_url': base_url,
 2357                                 'fragments': [],
 2358                                 'protocol': 'http_dash_segments',
 2359                             })
 2360                             if 'initialization_url' in representation_ms_info:
 2361                                 initialization_url = representation_ms_info['initialization_url']
 2362                                 if not f.get('url'):
 2363                                     f['url'] = initialization_url
 2364                                 f['fragments'].append({location_key(initialization_url): initialization_url})
 2365                             f['fragments'].extend(representation_ms_info['fragments'])
 2366                         else:
 2367                             # Assuming direct URL to unfragmented media.
 2368                             f['url'] = base_url
 2369                         formats.append(f)
 2370                     else:
 2371                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
 2372         return formats
 2373 
 2374     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 2375         res = self._download_xml_handle(
 2376             ism_url, video_id,
 2377             note=note or 'Downloading ISM manifest',
 2378             errnote=errnote or 'Failed to download ISM manifest',
 2379             fatal=fatal, data=data, headers=headers, query=query)
 2380         if res is False:
 2381             return []
 2382         ism_doc, urlh = res
 2383         if ism_doc is None:
 2384             return []
 2385 
 2386         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
 2387 
 2388     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
 2389         """
 2390         Parse formats from ISM manifest.
 2391         References:
 2392          1. [MS-SSTR]: Smooth Streaming Protocol,
 2393             https://msdn.microsoft.com/en-us/library/ff469518.aspx
 2394         """
 2395         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
 2396             return []
 2397 
 2398         duration = int(ism_doc.attrib['Duration'])
 2399         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
 2400 
 2401         formats = []
 2402         for stream in ism_doc.findall('StreamIndex'):
 2403             stream_type = stream.get('Type')
 2404             if stream_type not in ('video', 'audio'):
 2405                 continue
 2406             url_pattern = stream.attrib['Url']
 2407             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
 2408             stream_name = stream.get('Name')
 2409             for track in stream.findall('QualityLevel'):
 2410                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
 2411                 # TODO: add support for WVC1 and WMAP
 2412                 if fourcc not in ('H264', 'AVC1', 'AACL'):
 2413                     self.report_warning('%s is not a supported codec' % fourcc)
 2414                     continue
 2415                 tbr = int(track.attrib['Bitrate']) // 1000
 2416                 # [1] does not mention Width and Height attributes. However,
 2417                 # they're often present while MaxWidth and MaxHeight are
 2418                 # missing, so should be used as fallbacks
 2419                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
 2420                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
 2421                 sampling_rate = int_or_none(track.get('SamplingRate'))
 2422 
 2423                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
 2424                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
 2425 
 2426                 fragments = []
 2427                 fragment_ctx = {
 2428                     'time': 0,
 2429                 }
 2430                 stream_fragments = stream.findall('c')
 2431                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
 2432                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
 2433                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
 2434                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
 2435                     if not fragment_ctx['duration']:
 2436                         try:
 2437                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
 2438                         except IndexError:
 2439                             next_fragment_time = duration
 2440                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
 2441                     for _ in range(fragment_repeat):
 2442                         fragments.append({
 2443                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
 2444                             'duration': fragment_ctx['duration'] / stream_timescale,
 2445                         })
 2446                         fragment_ctx['time'] += fragment_ctx['duration']
 2447 
 2448                 format_id = []
 2449                 if ism_id:
 2450                     format_id.append(ism_id)
 2451                 if stream_name:
 2452                     format_id.append(stream_name)
 2453                 format_id.append(compat_str(tbr))
 2454 
 2455                 formats.append({
 2456                     'format_id': '-'.join(format_id),
 2457                     'url': ism_url,
 2458                     'manifest_url': ism_url,
 2459                     'ext': 'ismv' if stream_type == 'video' else 'isma',
 2460                     'width': width,
 2461                     'height': height,
 2462                     'tbr': tbr,
 2463                     'asr': sampling_rate,
 2464                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
 2465                     'acodec': 'none' if stream_type == 'video' else fourcc,
 2466                     'protocol': 'ism',
 2467                     'fragments': fragments,
 2468                     '_download_params': {
 2469                         'duration': duration,
 2470                         'timescale': stream_timescale,
 2471                         'width': width or 0,
 2472                         'height': height or 0,
 2473                         'fourcc': fourcc,
 2474                         'codec_private_data': track.get('CodecPrivateData'),
 2475                         'sampling_rate': sampling_rate,
 2476                         'channels': int_or_none(track.get('Channels', 2)),
 2477                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
 2478                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
 2479                     },
 2480                 })
 2481         return formats
 2482 
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """
        Extract media entries from HTML5 <video>/<audio> tags in webpage
        (including amp-* and dl8-* variants).

        Returns a list of dicts, each with 'formats', 'subtitles' and
        'thumbnail' keys; tags yielding neither formats nor subtitles are
        dropped.  base_url is used both to resolve relative URLs and as the
        Referer header attached to every extracted format.
        """
        def absolute_url(item_url):
            # Resolve item_url against the page URL (None-safe via urljoin)
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive ext and codec fields from a MIME type such as
            # 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # NOTE: the shared mutable default is only read, never mutated.
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into multiple formats, plain URLs yield a single one.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags carry no inner content, hence the empty string
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to a resolution embedded in the label,
                            # e.g. "720p"
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # formats[0] holds url/vcodec from _media_formats and
                        # takes precedence over the values derived above
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
 2605 
 2606     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
 2607         signed = 'hdnea=' in manifest_url
 2608         if not signed:
 2609             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
 2610             manifest_url = re.sub(
 2611                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
 2612                 '', manifest_url).strip('?')
 2613 
 2614         formats = []
 2615 
 2616         hdcore_sign = 'hdcore=3.7.0'
 2617         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
 2618         hds_host = hosts.get('hds')
 2619         if hds_host:
 2620             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
 2621         if 'hdcore=' not in f4m_url:
 2622             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
 2623         f4m_formats = self._extract_f4m_formats(
 2624             f4m_url, video_id, f4m_id='hds', fatal=False)
 2625         for entry in f4m_formats:
 2626             entry.update({'extra_param_to_segment_url': hdcore_sign})
 2627         formats.extend(f4m_formats)
 2628 
 2629         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
 2630         hls_host = hosts.get('hls')
 2631         if hls_host:
 2632             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
 2633         m3u8_formats = self._extract_m3u8_formats(
 2634             m3u8_url, video_id, 'mp4', 'm3u8_native',
 2635             m3u8_id='hls', fatal=False)
 2636         formats.extend(m3u8_formats)
 2637 
 2638         http_host = hosts.get('http')
 2639         if http_host and m3u8_formats and not signed:
 2640             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
 2641             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
 2642             qualities_length = len(qualities)
 2643             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
 2644                 i = 0
 2645                 for f in m3u8_formats:
 2646                     if f['vcodec'] != 'none':
 2647                         for protocol in ('http', 'https'):
 2648                             http_f = f.copy()
 2649                             del http_f['manifest_url']
 2650                             http_url = re.sub(
 2651                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
 2652                             http_f.update({
 2653                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
 2654                                 'url': http_url,
 2655                                 'protocol': protocol,
 2656                             })
 2657                             formats.append(http_f)
 2658                         i += 1
 2659 
 2660         return formats
 2661 
 2662     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
 2663         query = compat_urlparse.urlparse(url).query
 2664         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
 2665         mobj = re.search(
 2666             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
 2667         url_base = mobj.group('url')
 2668         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
 2669         formats = []
 2670 
 2671         def manifest_url(manifest):
 2672             m_url = '%s/%s' % (http_base_url, manifest)
 2673             if query:
 2674                 m_url += '?%s' % query
 2675             return m_url
 2676 
 2677         if 'm3u8' not in skip_protocols:
 2678             formats.extend(self._extract_m3u8_formats(
 2679                 manifest_url('playlist.m3u8'), video_id, 'mp4',
 2680                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
 2681         if 'f4m' not in skip_protocols:
 2682             formats.extend(self._extract_f4m_formats(
 2683                 manifest_url('manifest.f4m'),
 2684                 video_id, f4m_id='hds', fatal=False))
 2685         if 'dash' not in skip_protocols:
 2686             formats.extend(self._extract_mpd_formats(
 2687                 manifest_url('manifest.mpd'),
 2688                 video_id, mpd_id='dash', fatal=False))
 2689         if re.search(r'(?:/smil:|\.smil)', url_base):
 2690             if 'smil' not in skip_protocols:
 2691                 rtmp_formats = self._extract_smil_formats(
 2692                     manifest_url('jwplayer.smil'),
 2693                     video_id, fatal=False)
 2694                 for rtmp_format in rtmp_formats:
 2695                     rtsp_format = rtmp_format.copy()
 2696                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
 2697                     del rtsp_format['play_path']
 2698                     del rtsp_format['ext']
 2699                     rtsp_format.update({
 2700                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
 2701                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
 2702                         'protocol': 'rtsp',
 2703                     })
 2704                     formats.extend([rtmp_format, rtsp_format])
 2705         else:
 2706             for protocol in ('rtmp', 'rtsp'):
 2707                 if protocol not in skip_protocols:
 2708                     formats.append({
 2709                         'url': '%s:%s' % (protocol, url_base),
 2710                         'format_id': protocol,
 2711                         'protocol': protocol,
 2712                     })
 2713         return formats
 2714 
 2715     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
 2716         mobj = re.search(
 2717             r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
 2718             webpage)
 2719         if mobj:
 2720             try:
 2721                 jwplayer_data = self._parse_json(mobj.group('options'),
 2722                                                  video_id=video_id,
 2723                                                  transform_source=transform_source)
 2724             except ExtractorError:
 2725                 pass
 2726             else:
 2727                 if isinstance(jwplayer_data, dict):
 2728                     return jwplayer_data
 2729 
 2730     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
 2731         jwplayer_data = self._find_jwplayer_data(
 2732             webpage, video_id, transform_source=js_to_json)
 2733         return self._parse_jwplayer_data(
 2734             jwplayer_data, video_id, *args, **kwargs)
 2735 
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Build info dict(s) from parsed JWPlayer setup data.

        Returns a single entry dict when the playlist has exactly one item,
        otherwise a playlist result; returns [] when jwplayer_data is not a
        dict.  Handles several legacy JWPlayer config layouts (flattened
        playlist, single playlist item, flattened sources).
        """
        flat_pl = try_get(jwplayer_data, lambda x: x.get('playlist') or True)
        if flat_pl is None:
            # not even a dict
            return []

        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if flat_pl is True:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, keyed by label (default 'en')
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                # title is mandatory unless require_title is False
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
                'genre': clean_html(video_data.get('genre')),
                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
                'season_number': int_or_none(video_data.get('season')),
                'episode_number': int_or_none(video_data.get('episode')),
                'release_year': int_or_none(video_data.get('releasedate')),
                'age_limit': int_or_none(video_data.get('age_restriction')),
            }
            # A single YouTube source is delegated to the YouTube extractor
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                # avoid exception in case of only subtitles
                if formats:
                    self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
 2817 
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Build a formats list from a JWPlayer 'sources' array.

        Manifest sources (HLS/DASH/SMIL) are expanded via the respective
        extractors; other sources become single progressive (or RTMP)
        formats.  Duplicate source URLs are processed only once.
        """
        urls = set()
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                # missing or already-seen URL
                continue
            urls.add(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                format_id = str_or_none(source.get('label'))
                height = int_or_none(source.get('height'))
                if height is None and format_id:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = parse_resolution(format_id).get('height')
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate'), scale=1000),
                    'filesize': int_or_none(source.get('filesize')),
                    'ext': ext,
                }
                if format_id:
                    a_format['format_id'] = format_id

                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split the URL into app URL and play path at the
                    # mp4:/mp3:/flv: prefix, keeping the prefix on play_path
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
 2884 
 2885     def _live_title(self, name):
 2886         """ Generate the title for a live video """
 2887         now = datetime.datetime.now()
 2888         now_str = now.strftime('%Y-%m-%d %H:%M')
 2889         return name + ' ' + now_str
 2890 
 2891     def _int(self, v, name, fatal=False, **kwargs):
 2892         res = int_or_none(v, **kwargs)
 2893         if 'get_attr' in kwargs:
 2894             print(getattr(v, kwargs['get_attr']))
 2895         if res is None:
 2896             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 2897             if fatal:
 2898                 raise ExtractorError(msg)
 2899             else:
 2900                 self._downloader.report_warning(msg)
 2901         return res
 2902 
 2903     def _float(self, v, name, fatal=False, **kwargs):
 2904         res = float_or_none(v, **kwargs)
 2905         if res is None:
 2906             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 2907             if fatal:
 2908                 raise ExtractorError(msg)
 2909             else:
 2910                 self._downloader.report_warning(msg)
 2911         return res
 2912 
 2913     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
 2914                     path='/', secure=False, discard=False, rest={}, **kwargs):
 2915         cookie = compat_cookiejar_Cookie(
 2916             0, name, value, port, port is not None, domain, True,
 2917             domain.startswith('.'), path, True, secure, expire_time,
 2918             discard, None, None, rest)
 2919         self._downloader.cookiejar.set_cookie(cookie)
 2920 
 2921     def _get_cookies(self, url):
 2922         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
 2923         req = sanitized_Request(url)
 2924         self._downloader.cookiejar.add_cookie_header(req)
 2925         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
 2926 
 2927     def _apply_first_set_cookie_header(self, url_handle, cookie):
 2928         """
 2929         Apply first Set-Cookie header instead of the last. Experimental.
 2930 
 2931         Some sites (e.g. [1-3]) may serve two cookies under the same name
 2932         in Set-Cookie header and expect the first (old) one to be set rather
 2933         than second (new). However, as of RFC6265 the newer one cookie
 2934         should be set into cookie store what actually happens.
 2935         We will workaround this issue by resetting the cookie to
 2936         the first one manually.
 2937         1. https://new.vk.com/
 2938         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
 2939         3. https://learning.oreilly.com/
 2940         """
 2941         for header, cookies in url_handle.headers.items():
 2942             if header.lower() != 'set-cookie':
 2943                 continue
 2944             if sys.version_info[0] >= 3:
 2945                 cookies = cookies.encode('iso-8859-1')
 2946             cookies = cookies.decode('utf-8')
 2947             cookie_value = re.search(
 2948                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
 2949             if cookie_value:
 2950                 value, domain = cookie_value.groups()
 2951                 self._set_cookie(domain, cookie, value)
 2952                 break
 2953 
 2954     def get_testcases(self, include_onlymatching=False):
 2955         t = getattr(self, '_TEST', None)
 2956         if t:
 2957             assert not hasattr(self, '_TESTS'), \
 2958                 '%s has _TEST and _TESTS' % type(self).__name__
 2959             tests = [t]
 2960         else:
 2961             tests = getattr(self, '_TESTS', [])
 2962         for t in tests:
 2963             if not include_onlymatching and t.get('only_matching', False):
 2964                 continue
 2965             t['name'] = type(self).__name__[:-len('IE')]
 2966             yield t
 2967 
 2968     def is_suitable(self, age_limit):
 2969         """ Test whether the extractor is generally suitable for the given
 2970         age limit (i.e. pornographic sites are not, all others usually are) """
 2971 
 2972         any_restricted = False
 2973         for tc in self.get_testcases(include_onlymatching=False):
 2974             if tc.get('playlist', []):
 2975                 tc = tc['playlist'][0]
 2976             is_restricted = age_restricted(
 2977                 tc.get('info_dict', {}).get('age_limit'), age_limit)
 2978             if not is_restricted:
 2979                 return True
 2980             any_restricted = any_restricted or is_restricted
 2981         return not any_restricted
 2982 
 2983     def extract_subtitles(self, *args, **kwargs):
 2984         if (self._downloader.params.get('writesubtitles', False)
 2985                 or self._downloader.params.get('listsubtitles')):
 2986             return self._get_subtitles(*args, **kwargs)
 2987         return {}
 2988 
    def _get_subtitles(self, *args, **kwargs):
        # Subclasses override this to do the actual subtitle extraction;
        # called by extract_subtitles when the user requested subtitles.
        raise NotImplementedError('This method must be implemented by subclasses')
 2991 
 2992     @staticmethod
 2993     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
 2994         """ Merge subtitle items for one language. Items with duplicated URLs
 2995         will be dropped. """
 2996         list1_urls = set([item['url'] for item in subtitle_list1])
 2997         ret = list(subtitle_list1)
 2998         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
 2999         return ret
 3000 
 3001     @classmethod
 3002     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
 3003         """ Merge two subtitle dictionaries, language by language. """
 3004         ret = dict(subtitle_dict1)
 3005         for lang in subtitle_dict2:
 3006             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
 3007         return ret
 3008 
 3009     def extract_automatic_captions(self, *args, **kwargs):
 3010         if (self._downloader.params.get('writeautomaticsub', False)
 3011                 or self._downloader.params.get('listsubtitles')):
 3012             return self._get_automatic_captions(*args, **kwargs)
 3013         return {}
 3014 
    def _get_automatic_captions(self, *args, **kwargs):
        """Subclass hook called by extract_automatic_captions(); must return a
        dict mapping language codes to caption item lists."""
        raise NotImplementedError('This method must be implemented by subclasses')
 3017 
 3018     def mark_watched(self, *args, **kwargs):
 3019         if (self._downloader.params.get('mark_watched', False)
 3020                 and (self._get_login_info()[0] is not None
 3021                      or self._downloader.params.get('cookiefile') is not None)):
 3022             self._mark_watched(*args, **kwargs)
 3023 
    def _mark_watched(self, *args, **kwargs):
        """Subclass hook called by mark_watched(); performs the site-specific
        watched-state update."""
        raise NotImplementedError('This method must be implemented by subclasses')
 3026 
 3027     def geo_verification_headers(self):
 3028         headers = {}
 3029         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
 3030         if geo_verification_proxy:
 3031             headers['Ytdl-request-proxy'] = geo_verification_proxy
 3032         return headers
 3033 
 3034     def _generic_id(self, url):
 3035         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 3036 
 3037     def _generic_title(self, url):
 3038         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
 3039 
 3040 
 3041 class SearchInfoExtractor(InfoExtractor):
 3042     """
 3043     Base class for paged search queries extractors.
 3044     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
 3045     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 3046     """
 3047 
 3048     @classmethod
 3049     def _make_valid_url(cls):
 3050         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 3051 
 3052     @classmethod
 3053     def suitable(cls, url):
 3054         return re.match(cls._make_valid_url(), url) is not None
 3055 
 3056     def _real_extract(self, query):
 3057         mobj = re.match(self._make_valid_url(), query)
 3058         if mobj is None:
 3059             raise ExtractorError('Invalid search query "%s"' % query)
 3060 
 3061         prefix = mobj.group('prefix')
 3062         query = mobj.group('query')
 3063         if prefix == '':
 3064             return self._get_n_results(query, 1)
 3065         elif prefix == 'all':
 3066             return self._get_n_results(query, self._MAX_RESULTS)
 3067         else:
 3068             n = int(prefix)
 3069             if n <= 0:
 3070                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 3071             elif n > self._MAX_RESULTS:
 3072                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 3073                 n = self._MAX_RESULTS
 3074             return self._get_n_results(query, n)
 3075 
 3076     def _get_n_results(self, query, n):
 3077         """Get a specified number of results for a query"""
 3078         raise NotImplementedError('This method must be implemented by subclasses')
 3079 
 3080     @property
 3081     def SEARCH_KEY(self):
 3082         return self._SEARCH_KEY

Generated by cgit