summaryrefslogtreecommitdiff
path: root/youtube_dl/utils.py
blob: 904f23fd7747f5fd1957ab268713513da1526d5c (plain)
    1 #!/usr/bin/env python
    2 # -*- coding: utf-8 -*-
    3 
    4 from __future__ import unicode_literals
    5 
    6 import base64
    7 import binascii
    8 import calendar
    9 import codecs
   10 import contextlib
   11 import ctypes
   12 import datetime
   13 import email.utils
   14 import errno
   15 import functools
   16 import gzip
   17 import io
   18 import itertools
   19 import json
   20 import locale
   21 import math
   22 import operator
   23 import os
   24 import pipes
   25 import platform
   26 import re
   27 import socket
   28 import ssl
   29 import subprocess
   30 import sys
   31 import tempfile
   32 import traceback
   33 import xml.etree.ElementTree
   34 import zlib
   35 
   36 from .compat import (
   37     compat_HTMLParser,
   38     compat_basestring,
   39     compat_chr,
   40     compat_etree_fromstring,
   41     compat_html_entities,
   42     compat_html_entities_html5,
   43     compat_http_client,
   44     compat_kwargs,
   45     compat_parse_qs,
   46     compat_shlex_quote,
   47     compat_socket_create_connection,
   48     compat_str,
   49     compat_struct_pack,
   50     compat_struct_unpack,
   51     compat_urllib_error,
   52     compat_urllib_parse,
   53     compat_urllib_parse_urlencode,
   54     compat_urllib_parse_urlparse,
   55     compat_urllib_parse_unquote_plus,
   56     compat_urllib_request,
   57     compat_urlparse,
   58     compat_xpath,
   59 )
   60 
   61 from .socks import (
   62     ProxyType,
   63     sockssocket,
   64 )
   65 
   66 
   67 def register_socks_protocols():
   68     # "Register" SOCKS protocols
   69     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
   70     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
   71     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
   72         if scheme not in compat_urlparse.uses_netloc:
   73             compat_urlparse.uses_netloc.append(scheme)
   74 
   75 
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers added to every outgoing HTTP request
# (see YoutubeDLHandler.http_request below).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel distinguishing "no default supplied" from an explicit default of
# None (used by xpath_element/xpath_text/xpath_attr).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Media/manifest file extensions recognized by the project.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps accented characters to ASCII replacements (single letters or digraphs
# such as 'AE'/'ss'); consumed by sanitize_filename().
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() patterns tried in order when parsing date strings; the consumers
# live elsewhere in this module (not shown in this chunk).
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Extra patterns for locales that write the day before the month.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Extra patterns for locales that write the month before the day.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
  157 
  158 
  159 def preferredencoding():
  160     """Get preferred encoding.
  161 
  162     Returns the best encoding scheme for the system, based on
  163     locale.getpreferredencoding() and some further tweaks.
  164     """
  165     try:
  166         pref = locale.getpreferredencoding()
  167         'TEST'.encode(pref)
  168     except Exception:
  169         pref = 'UTF-8'
  170 
  171     return pref
  172 
  173 
  174 def write_json_file(obj, fn):
  175     """ Encode obj as JSON and write it to fn, atomically if possible """
  176 
  177     fn = encodeFilename(fn)
  178     if sys.version_info < (3, 0) and sys.platform != 'win32':
  179         encoding = get_filesystem_encoding()
  180         # os.path.basename returns a bytes object, but NamedTemporaryFile
  181         # will fail if the filename contains non ascii characters unless we
  182         # use a unicode object
  183         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  184         # the same for os.path.dirname
  185         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  186     else:
  187         path_basename = os.path.basename
  188         path_dirname = os.path.dirname
  189 
  190     args = {
  191         'suffix': '.tmp',
  192         'prefix': path_basename(fn) + '.',
  193         'dir': path_dirname(fn),
  194         'delete': False,
  195     }
  196 
  197     # In Python 2.x, json.dump expects a bytestream.
  198     # In Python 3.x, it writes to a character stream
  199     if sys.version_info < (3, 0):
  200         args['mode'] = 'wb'
  201     else:
  202         args.update({
  203             'mode': 'w',
  204             'encoding': 'utf-8',
  205         })
  206 
  207     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
  208 
  209     try:
  210         with tf:
  211             json.dump(obj, tf)
  212         if sys.platform == 'win32':
  213             # Need to remove existing file on Windows, else os.rename raises
  214             # WindowsError or FileExistsError.
  215             try:
  216                 os.unlink(fn)
  217             except OSError:
  218                 pass
  219         os.rename(tf.name, fn)
  220     except Exception:
  221         try:
  222             os.remove(tf.name)
  223         except OSError:
  224             pass
  225         raise
  226 
  227 
  228 if sys.version_info >= (2, 7):
  229     def find_xpath_attr(node, xpath, key, val=None):
  230         """ Find the xpath xpath[@key=val] """
  231         assert re.match(r'^[a-zA-Z_-]+$', key)
  232         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
  233         return node.find(expr)
  234 else:
  235     def find_xpath_attr(node, xpath, key, val=None):
  236         for f in node.findall(compat_xpath(xpath)):
  237             if key not in f.attrib:
  238                 continue
  239             if val is None or f.attrib.get(key) == val:
  240                 return f
  241         return None
  242 
  243 # On python2.6 the xml.etree.ElementTree.Element methods don't support
  244 # the namespace parameter
  245 
  246 
  247 def xpath_with_ns(path, ns_map):
  248     components = [c.split(':') for c in path.split('/')]
  249     replaced = []
  250     for c in components:
  251         if len(c) == 1:
  252             replaced.append(c[0])
  253         else:
  254             ns, tag = c
  255             replaced.append('{%s}%s' % (ns_map[ns], tag))
  256     return '/'.join(replaced)
  257 
  258 
  259 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  260     def _find_xpath(xpath):
  261         return node.find(compat_xpath(xpath))
  262 
  263     if isinstance(xpath, (str, compat_str)):
  264         n = _find_xpath(xpath)
  265     else:
  266         for xp in xpath:
  267             n = _find_xpath(xp)
  268             if n is not None:
  269                 break
  270 
  271     if n is None:
  272         if default is not NO_DEFAULT:
  273             return default
  274         elif fatal:
  275             name = xpath if name is None else name
  276             raise ExtractorError('Could not find XML element %s' % name)
  277         else:
  278             return None
  279     return n
  280 
  281 
  282 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  283     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
  284     if n is None or n == default:
  285         return n
  286     if n.text is None:
  287         if default is not NO_DEFAULT:
  288             return default
  289         elif fatal:
  290             name = xpath if name is None else name
  291             raise ExtractorError('Could not find XML element\'s text %s' % name)
  292         else:
  293             return None
  294     return n.text
  295 
  296 
  297 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
  298     n = find_xpath_attr(node, xpath, key)
  299     if n is None:
  300         if default is not NO_DEFAULT:
  301             return default
  302         elif fatal:
  303             name = '%s[@%s]' % (xpath, key) if name is None else name
  304             raise ExtractorError('Could not find XML attribute %s' % name)
  305         else:
  306             return None
  307     return n.attrib[key]
  308 
  309 
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper over the generic attribute matcher.
    return get_element_by_attribute('id', id, html)
  313 
  314 
  315 def get_element_by_class(class_name, html):
  316     return get_element_by_attribute(
  317         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
  318         html, escape_value=False)
  319 
  320 
  321 def get_element_by_attribute(attribute, value, html, escape_value=True):
  322     """Return the content of the tag with the specified attribute in the passed HTML document"""
  323 
  324     value = re.escape(value) if escape_value else value
  325 
  326     m = re.search(r'''(?xs)
  327         <([a-zA-Z0-9:._-]+)
  328          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
  329          \s+%s=['"]?%s['"]?
  330          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
  331         \s*>
  332         (?P<content>.*?)
  333         </\1>
  334     ''' % (re.escape(attribute), value), html)
  335 
  336     if not m:
  337         return None
  338     res = m.group('content')
  339 
  340     if res.startswith('"') or res.startswith("'"):
  341         res = res[1:-1]
  342 
  343     return unescapeHTML(res)
  344 
  345 
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # attrs: populated by handle_starttag(); empty until a tag is fed.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Keep the attributes of the most recently seen start tag.
        self.attrs = dict(attrs)
  354 
  355 
  356 def extract_attributes(html_element):
  357     """Given a string for an HTML element such as
  358     <el
  359          a="foo" B="bar" c="&98;az" d=boz
  360          empty= noval entity="&amp;"
  361          sq='"' dq="'"
  362     >
  363     Decode and return a dictionary of attributes.
  364     {
  365         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
  366         'empty': '', 'noval': None, 'entity': '&',
  367         'sq': '"', 'dq': '\''
  368     }.
  369     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
  370     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
  371     """
  372     parser = HTMLAttributeParser()
  373     parser.feed(html_element)
  374     parser.close()
  375     return parser.attrs
  376 
  377 
  378 def clean_html(html):
  379     """Clean an HTML snippet into a readable string"""
  380 
  381     if html is None:  # Convenience for sanitizing descriptions etc.
  382         return html
  383 
  384     # Newline vs <br />
  385     html = html.replace('\n', ' ')
  386     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
  387     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
  388     # Strip html tags
  389     html = re.sub('<.*?>', '', html)
  390     # Replace html entities
  391     html = unescapeHTML(html)
  392     return html.strip()
  393 
  394 
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output; on Windows switch stdout to binary
            # mode so media data is not mangled by CRLF translation.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming; propagate it.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
  425 
  426 
  427 def timeconvert(timestr):
  428     """Convert RFC 2822 defined time string into system timestamp"""
  429     timestamp = None
  430     timetuple = email.utils.parsedate_tz(timestr)
  431     if timetuple is not None:
  432         timestamp = email.utils.mktime_tz(timetuple)
  433     return timestamp
  434 
  435 
  436 def sanitize_filename(s, restricted=False, is_id=False):
  437     """Sanitizes a string so it could be used as part of a filename.
  438     If restricted is set, use a stricter subset of allowed characters.
  439     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
  440     """
  441     def replace_insane(char):
  442         if restricted and char in ACCENT_CHARS:
  443             return ACCENT_CHARS[char]
  444         if char == '?' or ord(char) < 32 or ord(char) == 127:
  445             return ''
  446         elif char == '"':
  447             return '' if restricted else '\''
  448         elif char == ':':
  449             return '_-' if restricted else ' -'
  450         elif char in '\\/|*<>':
  451             return '_'
  452         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
  453             return '_'
  454         if restricted and ord(char) > 127:
  455             return '_'
  456         return char
  457 
  458     # Handle timestamps
  459     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
  460     result = ''.join(map(replace_insane, s))
  461     if not is_id:
  462         while '__' in result:
  463             result = result.replace('__', '_')
  464         result = result.strip('_')
  465         # Common case of "Foreign band name - English song title"
  466         if restricted and result.startswith('-_'):
  467             result = result[2:]
  468         if result.startswith('-'):
  469             result = '_' + result[len('-'):]
  470         result = result.lstrip('.')
  471         if not result:
  472             result = '_'
  473     return result
  474 
  475 
  476 def sanitize_path(s):
  477     """Sanitizes and normalizes path on Windows"""
  478     if sys.platform != 'win32':
  479         return s
  480     drive_or_unc, _ = os.path.splitdrive(s)
  481     if sys.version_info < (2, 7) and not drive_or_unc:
  482         drive_or_unc, _ = os.path.splitunc(s)
  483     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
  484     if drive_or_unc:
  485         norm_path.pop(0)
  486     sanitized_path = [
  487         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
  488         for path_part in norm_path]
  489     if drive_or_unc:
  490         sanitized_path.insert(0, drive_or_unc + os.path.sep)
  491     return os.path.join(*sanitized_path)
  492 
  493 
  494 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
  495 # unwanted failures due to missing protocol
  496 def sanitize_url(url):
  497     return 'http:%s' % url if url.startswith('//') else url
  498 
  499 
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request after normalizing scheme-relative URLs.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
  502 
  503 
  504 def orderedSet(iterable):
  505     """ Remove all duplicates from the input iterable """
  506     res = []
  507     for el in iterable:
  508         if el not in res:
  509             res.append(el)
  510     return res
  511 
  512 
  513 def _htmlentity_transform(entity_with_semicolon):
  514     """Transforms an HTML entity to a character."""
  515     entity = entity_with_semicolon[:-1]
  516 
  517     # Known non-numeric HTML entity
  518     if entity in compat_html_entities.name2codepoint:
  519         return compat_chr(compat_html_entities.name2codepoint[entity])
  520 
  521     # TODO: HTML5 allows entities without a semicolon. For example,
  522     # '&Eacuteric' should be decoded as 'Éric'.
  523     if entity_with_semicolon in compat_html_entities_html5:
  524         return compat_html_entities_html5[entity_with_semicolon]
  525 
  526     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
  527     if mobj is not None:
  528         numstr = mobj.group(1)
  529         if numstr.startswith('x'):
  530             base = 16
  531             numstr = '0%s' % numstr
  532         else:
  533             base = 10
  534         # See https://github.com/rg3/youtube-dl/issues/7518
  535         try:
  536             return compat_chr(int(numstr, base))
  537         except ValueError:
  538             pass
  539 
  540     # Unknown entity in name, return its literal representation
  541     return '&%s;' % entity
  542 
  543 
  544 def unescapeHTML(s):
  545     if s is None:
  546         return None
  547     assert type(s) == compat_str
  548 
  549     return re.sub(
  550         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  551 
  552 
  553 def get_subprocess_encoding():
  554     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
  555         # For subprocess calls, encode with locale encoding
  556         # Refer to http://stackoverflow.com/a/9951851/35070
  557         encoding = preferredencoding()
  558     else:
  559         encoding = sys.getfilesystemencoding()
  560     if encoding is None:
  561         encoding = 'utf-8'
  562     return encoding
  563 
  564 
  565 def encodeFilename(s, for_subprocess=False):
  566     """
  567     @param s The name of the file
  568     """
  569 
  570     assert type(s) == compat_str
  571 
  572     # Python 3 has a Unicode API
  573     if sys.version_info >= (3, 0):
  574         return s
  575 
  576     # Pass '' directly to use Unicode APIs on Windows 2000 and up
  577     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
  578     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
  579     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
  580         return s
  581 
  582     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
  583     if sys.platform.startswith('java'):
  584         return s
  585 
  586     return s.encode(get_subprocess_encoding(), 'ignore')
  587 
  588 
  589 def decodeFilename(b, for_subprocess=False):
  590 
  591     if sys.version_info >= (3, 0):
  592         return b
  593 
  594     if not isinstance(b, bytes):
  595         return b
  596 
  597     return b.decode(get_subprocess_encoding(), 'ignore')
  598 
  599 
def encodeArgument(s):
    """Encode a subprocess argument, tolerating legacy byte-string callers."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
  607 
  608 
def decodeArgument(b):
    # Counterpart of encodeArgument(): decode using subprocess encoding rules.
    return decodeFilename(b, True)
  611 
  612 
  613 def decodeOption(optval):
  614     if optval is None:
  615         return optval
  616     if isinstance(optval, bytes):
  617         optval = optval.decode(preferredencoding())
  618 
  619     assert isinstance(optval, compat_str)
  620     return optval
  621 
  622 
  623 def formatSeconds(secs):
  624     if secs > 3600:
  625         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
  626     elif secs > 60:
  627         return '%d:%02d' % (secs // 60, secs % 60)
  628     else:
  629         return '%d' % secs
  630 
  631 
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler configured from the options dict `params`.

    Honors the 'nocheckcertificate' option by disabling hostname and
    certificate verification, and picks the best ssl API available to the
    running interpreter.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # Old Pythons: HTTPSHandler has no usable context support at all.
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # Manual SSLContext; certificate checking on unless explicitly disabled.
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
  655 
  656 
  657 def bug_reports_message():
  658     if ytdl_is_updateable():
  659         update_cmd = 'type  youtube-dl -U  to update'
  660     else:
  661         update_cmd = 'see  https://yt-dl.org/update  on how to update'
  662     msg = '; please report this issue on https://yt-dl.org/bug .'
  663     msg += ' Make sure you are using the latest version; %s.' % update_cmd
  664     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
  665     return msg
  666 
  667 
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as expected (not bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors carry the standard bug-report instructions.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as a string, or None when absent.
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
  695 
  696 
class UnsupportedError(ExtractorError):
    """Raised when no extractor handles the given URL; always 'expected'."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        # Keep the offending URL for programmatic inspection.
        self.url = url
  702 
  703 
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
  707 
  708 
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # exc_info: original (type, value, traceback) triple, or None.
        self.exc_info = exc_info
  721 
  722 
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
  730 
  731 
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # msg: human-readable description of the postprocessing failure.
        self.msg = msg
  741 
  742 
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
  746 
  747 
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
  755 
  756 
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
  769 
  770 
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate an HTTP(S) connection, binding it to the configured
    source_address (local address) when one is set in the handler's params.
    Used as the connection factory passed to do_open()."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with a version
            # that creates the socket bound to the requested local address.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
  796 
  797 
  798 def handle_youtubedl_headers(headers):
  799     filtered_headers = headers
  800 
  801     if 'Youtubedl-no-compression' in filtered_headers:
  802         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
  803         del filtered_headers['Youtubedl-no-compression']
  804 
  805     return filtered_headers
  806 
  807 
  808 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
  809     """Handler for HTTP requests and responses.
  810 
  811     This class, when installed with an OpenerDirector, automatically adds
  812     the standard headers to every HTTP request and handles gzipped and
  813     deflated responses from web servers. If compression is to be avoided in
  814     a particular request, the original request in the program code only has
  815     to include the HTTP header "Youtubedl-no-compression", which will be
  816     removed before making the real request.
  817 
  818     Part of this code was copied from:
  819 
  820     http://techknack.net/python-urllib2-handlers/
  821 
  822     Andrew Rowls, the author of that code, agreed to release it to the
  823     public domain.
  824     """
  825 
    def __init__(self, params, *args, **kwargs):
        # params: the options dict; stashed so _create_http_connection can
        # read e.g. 'source_address' later.
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
  829 
  830     def http_open(self, req):
  831         conn_class = compat_http_client.HTTPConnection
  832 
  833         socks_proxy = req.headers.get('Ytdl-socks-proxy')
  834         if socks_proxy:
  835             conn_class = make_socks_conn_class(conn_class, socks_proxy)
  836             del req.headers['Ytdl-socks-proxy']
  837 
  838         return self.do_open(functools.partial(
  839             _create_http_connection, self, conn_class, False),
  840             req)
  841 
    @staticmethod
    def deflate(data):
        # Decompress deflate payloads: most servers send a raw stream
        # (no zlib header), so try that first and fall back to zlib-wrapped.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)
  848 
  849     @staticmethod
  850     def addinfourl_wrapper(stream, headers, url, code):
  851         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
  852             return compat_urllib_request.addinfourl(stream, headers, url, code)
  853         ret = compat_urllib_request.addinfourl(stream, headers, url)
  854         ret.code = code
  855         return ret
  856 
    def http_request(self, req):
        """Pre-process an outgoing request: percent-encode non-ASCII URLs,
        fill in the std_headers defaults, and strip internal headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Remove youtube-dl-internal marker headers before sending.
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req
  887 
    def http_response(self, req, resp):
        """Post-process a response.

        Transparently decompresses gzip- and deflate-encoded bodies (removing
        the Content-encoding header afterwards) and percent-encodes non-ASCII
        redirect targets in the Location header.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes chopped off; if no
                # truncation succeeds, re-raise the original error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp
  934 
  935     https_request = http_request
  936     https_response = http_response
  937 
  938 
  939 def make_socks_conn_class(base_class, socks_proxy):
  940     assert issubclass(base_class, (
  941         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
  942 
  943     url_components = compat_urlparse.urlparse(socks_proxy)
  944     if url_components.scheme.lower() == 'socks5':
  945         socks_type = ProxyType.SOCKS5
  946     elif url_components.scheme.lower() in ('socks', 'socks4'):
  947         socks_type = ProxyType.SOCKS4
  948     elif url_components.scheme.lower() == 'socks4a':
  949         socks_type = ProxyType.SOCKS4A
  950 
  951     def unquote_if_non_empty(s):
  952         if not s:
  953             return s
  954         return compat_urllib_parse_unquote_plus(s)
  955 
  956     proxy_args = (
  957         socks_type,
  958         url_components.hostname, url_components.port or 1080,
  959         True,  # Remote DNS
  960         unquote_if_non_empty(url_components.username),
  961         unquote_if_non_empty(url_components.password),
  962     )
  963 
  964     class SocksConnection(base_class):
  965         def connect(self):
  966             self.sock = sockssocket()
  967             self.sock.setproxy(*proxy_args)
  968             if type(self.timeout) in (int, float):
  969                 self.sock.settimeout(self.timeout)
  970             self.sock.connect((self.host, self.port))
  971 
  972             if isinstance(self, compat_http_client.HTTPSConnection):
  973                 if hasattr(self, '_context'):  # Python > 2.6
  974                     self.sock = self._context.wrap_socket(
  975                         self.sock, server_hostname=self.host)
  976                 else:
  977                     self.sock = ssl.wrap_socket(self.sock)
  978 
  979     return SocksConnection
  980 
  981 
  982 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
  983     def __init__(self, params, https_conn_class=None, *args, **kwargs):
  984         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
  985         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
  986         self._params = params
  987 
  988     def https_open(self, req):
  989         kwargs = {}
  990         conn_class = self._https_conn_class
  991 
  992         if hasattr(self, '_context'):  # python > 2.6
  993             kwargs['context'] = self._context
  994         if hasattr(self, '_check_hostname'):  # python 3.x
  995             kwargs['check_hostname'] = self._check_hostname
  996 
  997         socks_proxy = req.headers.get('Ytdl-socks-proxy')
  998         if socks_proxy:
  999             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 1000             del req.headers['Ytdl-socks-proxy']
 1001 
 1002         return self.do_open(functools.partial(
 1003             _create_http_connection, self, conn_class, True),
 1004             req, **kwargs)
 1005 
 1006 
 1007 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
 1008     def __init__(self, cookiejar=None):
 1009         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
 1010 
 1011     def http_response(self, request, response):
 1012         # Python 2 will choke on next HTTP request in row if there are non-ASCII
 1013         # characters in Set-Cookie HTTP header of last response (see
 1014         # https://github.com/rg3/youtube-dl/issues/6769).
 1015         # In order to at least prevent crashing we will percent encode Set-Cookie
 1016         # header before HTTPCookieProcessor starts processing it.
 1017         # if sys.version_info < (3, 0) and response.headers:
 1018         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
 1019         #         set_cookie = response.headers.get(set_cookie_header)
 1020         #         if set_cookie:
 1021         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
 1022         #             if set_cookie != set_cookie_escaped:
 1023         #                 del response.headers[set_cookie_header]
 1024         #                 response.headers[set_cookie_header] = set_cookie_escaped
 1025         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
 1026 
 1027     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
 1028     https_response = http_response
 1029 
 1030 
 1031 def extract_timezone(date_str):
 1032     m = re.search(
 1033         r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 1034         date_str)
 1035     if not m:
 1036         timezone = datetime.timedelta()
 1037     else:
 1038         date_str = date_str[:-len(m.group('tz'))]
 1039         if not m.group('sign'):
 1040             timezone = datetime.timedelta()
 1041         else:
 1042             sign = 1 if m.group('sign') == '+' else -1
 1043             timezone = datetime.timedelta(
 1044                 hours=sign * int(m.group('hours')),
 1045                 minutes=sign * int(m.group('minutes')))
 1046     return timezone, date_str
 1047 
 1048 
 1049 def parse_iso8601(date_str, delimiter='T', timezone=None):
 1050     """ Return a UNIX timestamp from the given date """
 1051 
 1052     if date_str is None:
 1053         return None
 1054 
 1055     date_str = re.sub(r'\.[0-9]+', '', date_str)
 1056 
 1057     if timezone is None:
 1058         timezone, date_str = extract_timezone(date_str)
 1059 
 1060     try:
 1061         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 1062         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 1063         return calendar.timegm(dt.timetuple())
 1064     except ValueError:
 1065         pass
 1066 
 1067 
 1068 def date_formats(day_first=True):
 1069     return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
 1070 
 1071 
 1072 def unified_strdate(date_str, day_first=True):
 1073     """Return a string with the date in the format YYYYMMDD"""
 1074 
 1075     if date_str is None:
 1076         return None
 1077     upload_date = None
 1078     # Replace commas
 1079     date_str = date_str.replace(',', ' ')
 1080     # Remove AM/PM + timezone
 1081     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 1082     _, date_str = extract_timezone(date_str)
 1083 
 1084     for expression in date_formats(day_first):
 1085         try:
 1086             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 1087         except ValueError:
 1088             pass
 1089     if upload_date is None:
 1090         timetuple = email.utils.parsedate_tz(date_str)
 1091         if timetuple:
 1092             try:
 1093                 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 1094             except ValueError:
 1095                 pass
 1096     if upload_date is not None:
 1097         return compat_str(upload_date)
 1098 
 1099 
 1100 def unified_timestamp(date_str, day_first=True):
 1101     if date_str is None:
 1102         return None
 1103 
 1104     date_str = date_str.replace(',', ' ')
 1105 
 1106     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
 1107     timezone, date_str = extract_timezone(date_str)
 1108 
 1109     # Remove AM/PM + timezone
 1110     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 1111 
 1112     for expression in date_formats(day_first):
 1113         try:
 1114             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
 1115             return calendar.timegm(dt.timetuple())
 1116         except ValueError:
 1117             pass
 1118     timetuple = email.utils.parsedate_tz(date_str)
 1119     if timetuple:
 1120         return calendar.timegm(timetuple) + pm_delta * 3600
 1121 
 1122 
 1123 def determine_ext(url, default_ext='unknown_video'):
 1124     if url is None:
 1125         return default_ext
 1126     guess = url.partition('?')[0].rpartition('.')[2]
 1127     if re.match(r'^[A-Za-z0-9]+$', guess):
 1128         return guess
 1129     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
 1130     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
 1131         return guess.rstrip('/')
 1132     else:
 1133         return default_ext
 1134 
 1135 
 1136 def subtitles_filename(filename, sub_lang, sub_format):
 1137     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 1138 
 1139 
 1140 def date_from_str(date_str):
 1141     """
 1142     Return a datetime object from a string in the format YYYYMMDD or
 1143     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 1144     today = datetime.date.today()
 1145     if date_str in ('now', 'today'):
 1146         return today
 1147     if date_str == 'yesterday':
 1148         return today - datetime.timedelta(days=1)
 1149     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 1150     if match is not None:
 1151         sign = match.group('sign')
 1152         time = int(match.group('time'))
 1153         if sign == '-':
 1154             time = -time
 1155         unit = match.group('unit')
 1156         # A bad approximation?
 1157         if unit == 'month':
 1158             unit = 'day'
 1159             time *= 30
 1160         elif unit == 'year':
 1161             unit = 'day'
 1162             time *= 365
 1163         unit += 's'
 1164         delta = datetime.timedelta(**{unit: time})
 1165         return today + delta
 1166     return datetime.datetime.strptime(date_str, '%Y%m%d').date()
 1167 
 1168 
 1169 def hyphenate_date(date_str):
 1170     """
 1171     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 1172     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 1173     if match is not None:
 1174         return '-'.join(match.groups())
 1175     else:
 1176         return date_str
 1177 
 1178 
 1179 class DateRange(object):
 1180     """Represents a time interval between two dates"""
 1181 
 1182     def __init__(self, start=None, end=None):
 1183         """start and end must be strings in the format accepted by date"""
 1184         if start is not None:
 1185             self.start = date_from_str(start)
 1186         else:
 1187             self.start = datetime.datetime.min.date()
 1188         if end is not None:
 1189             self.end = date_from_str(end)
 1190         else:
 1191             self.end = datetime.datetime.max.date()
 1192         if self.start > self.end:
 1193             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 1194 
 1195     @classmethod
 1196     def day(cls, day):
 1197         """Returns a range that only contains the given day"""
 1198         return cls(day, day)
 1199 
 1200     def __contains__(self, date):
 1201         """Check if the date is in the range"""
 1202         if not isinstance(date, datetime.date):
 1203             date = date_from_str(date)
 1204         return self.start <= date <= self.end
 1205 
 1206     def __str__(self):
 1207         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 1208 
 1209 
 1210 def platform_name():
 1211     """ Returns the platform name as a compat_str """
 1212     res = platform.platform()
 1213     if isinstance(res, bytes):
 1214         res = res.decode(preferredencoding())
 1215 
 1216     assert isinstance(res, compat_str)
 1217     return res
 1218 
 1219 
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to GetStdHandle ids
    # (-11 = STD_OUTPUT_HANDLE, -12 = STD_ERROR_HANDLE).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Only character devices on which GetConsoleMode succeeds are real
        # consoles; redirected pipes/files fail one of these checks.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) when there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # BMP characters go out in chunks of up to 1024; a leading non-BMP
        # character is written alone as a two-unit UTF-16 surrogate pair.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
 1293 
 1294 
 1295 def write_string(s, out=None, encoding=None):
 1296     if out is None:
 1297         out = sys.stderr
 1298     assert type(s) == compat_str
 1299 
 1300     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 1301         if _windows_write_string(s, out):
 1302             return
 1303 
 1304     if ('b' in getattr(out, 'mode', '') or
 1305             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 1306         byt = s.encode(encoding or preferredencoding(), 'ignore')
 1307         out.write(byt)
 1308     elif hasattr(out, 'buffer'):
 1309         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 1310         byt = s.encode(enc, 'ignore')
 1311         out.buffer.write(byt)
 1312     else:
 1313         out.write(s)
 1314     out.flush()
 1315 
 1316 
 1317 def bytes_to_intlist(bs):
 1318     if not bs:
 1319         return []
 1320     if isinstance(bs[0], int):  # Python 3
 1321         return list(bs)
 1322     else:
 1323         return [ord(c) for c in bs]
 1324 
 1325 
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a byte string (inverse of
    bytes_to_intlist)."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
 1330 
 1331 
# Cross-platform file locking
if sys.platform == 'win32':
    # Windows: advisory locks via the Win32 LockFileEx/UnlockFileEx API on
    # the OS handle behind the C file descriptor.
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure; only the offset fields
        # (start of the locked region) are used here.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file for the lock calls below.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock the whole file behind *f*; flag 0x2 = LOCKFILE_EXCLUSIVE_LOCK.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object; unlocking
        # must describe the same region.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            # POSIX advisory lock: exclusive for writers, shared for readers.
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
 1405 
 1406 
class locked_file(object):
    # Context manager wrapping an open file with an OS-level advisory lock
    # (exclusive for 'w'/'a', shared for 'r'); delegates I/O to the file.

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Writers need exclusivity; readers can share the lock.
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Don't leak the file object when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
 1436 
 1437 
 1438 def get_filesystem_encoding():
 1439     encoding = sys.getfilesystemencoding()
 1440     return encoding if encoding is not None else 'utf-8'
 1441 
 1442 
 1443 def shell_quote(args):
 1444     quoted_args = []
 1445     encoding = get_filesystem_encoding()
 1446     for a in args:
 1447         if isinstance(a, bytes):
 1448             # We may get a filename encoded with 'encodeFilename'
 1449             a = a.decode(encoding)
 1450         quoted_args.append(pipes.quote(a))
 1451     return ' '.join(quoted_args)
 1452 
 1453 
 1454 def smuggle_url(url, data):
 1455     """ Pass additional data in a URL for internal use. """
 1456 
 1457     url, idata = unsmuggle_url(url, {})
 1458     data.update(idata)
 1459     sdata = compat_urllib_parse_urlencode(
 1460         {'__youtubedl_smuggle': json.dumps(data)})
 1461     return url + '#' + sdata
 1462 
 1463 
 1464 def unsmuggle_url(smug_url, default=None):
 1465     if '#__youtubedl_smuggle' not in smug_url:
 1466         return smug_url, default
 1467     url, _, sdata = smug_url.rpartition('#')
 1468     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
 1469     data = json.loads(jsond)
 1470     return url, data
 1471 
 1472 
 1473 def format_bytes(bytes):
 1474     if bytes is None:
 1475         return 'N/A'
 1476     if type(bytes) is str:
 1477         bytes = float(bytes)
 1478     if bytes == 0.0:
 1479         exponent = 0
 1480     else:
 1481         exponent = int(math.log(bytes, 1024.0))
 1482     suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
 1483     converted = float(bytes) / float(1024 ** exponent)
 1484     return '%.2f%s' % (converted, suffix)
 1485 
 1486 
 1487 def lookup_unit_table(unit_table, s):
 1488     units_re = '|'.join(re.escape(u) for u in unit_table)
 1489     m = re.match(
 1490         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
 1491     if not m:
 1492         return None
 1493     num_str = m.group('num').replace(',', '.')
 1494     mult = unit_table[m.group('unit')]
 1495     return int(float(num_str) * mult)
 1496 
 1497 
def parse_filesize(s):
    """Parse a human-readable file size ('5 MB', '1.5GiB', ...) into a byte
    count, or None.  Both decimal (kB/MB/...) and binary (KiB/MiB/...)
    units are understood; lowercase-letter prefixes are treated as binary."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
 1567 
 1568 
def parse_count(s):
    """Parse a viewer/like count such as '12,345' or '1.5M' into an int,
    or None when it cannot be parsed."""
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        # Plain number, possibly with thousands separators.
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
 1588 
 1589 
 1590 def month_by_name(name):
 1591     """ Return the number of a month by (locale-independently) English name """
 1592 
 1593     try:
 1594         return ENGLISH_MONTH_NAMES.index(name) + 1
 1595     except ValueError:
 1596         return None
 1597 
 1598 
 1599 def month_by_abbreviation(abbrev):
 1600     """ Return the number of a month by (locale-independently) English
 1601         abbreviations """
 1602 
 1603     try:
 1604         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
 1605     except ValueError:
 1606         return None
 1607 
 1608 
 1609 def fix_xml_ampersands(xml_str):
 1610     """Replace all the '&' by '&amp;' in XML"""
 1611     return re.sub(
 1612         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
 1613         '&amp;',
 1614         xml_str)
 1615 
 1616 
def setproctitle(title):
    """Set the process name shown by tools like ps to *title*.

    Works via glibc prctl(PR_SET_NAME) and is silently a no-op on platforms
    where that is unavailable (Jython, non-glibc systems, odd libcs).
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
 1636 
 1637 
 1638 def remove_start(s, start):
 1639     return s[len(start):] if s is not None and s.startswith(start) else s
 1640 
 1641 
 1642 def remove_end(s, end):
 1643     return s[:-len(end)] if s is not None and s.endswith(end) else s
 1644 
 1645 
 1646 def remove_quotes(s):
 1647     if s is None or len(s) < 2:
 1648         return s
 1649     for quote in ('"', "'", ):
 1650         if s[0] == quote and s[-1] == quote:
 1651             return s[1:-1]
 1652     return s
 1653 
 1654 
 1655 def url_basename(url):
 1656     path = compat_urlparse.urlparse(url).path
 1657     return path.strip('/').split('/')[-1]
 1658 
 1659 
class HEADRequest(compat_urllib_request.Request):
    # A Request whose HTTP method is forced to HEAD.
    def get_method(self):
        return 'HEAD'
 1663 
 1664 
class PUTRequest(compat_urllib_request.Request):
    # A Request whose HTTP method is forced to PUT.
    def get_method(self):
        return 'PUT'
 1668 
 1669 
 1670 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
 1671     if get_attr:
 1672         if v is not None:
 1673             v = getattr(v, get_attr, None)
 1674     if v == '':
 1675         v = None
 1676     if v is None:
 1677         return default
 1678     try:
 1679         return int(v) * invscale // scale
 1680     except ValueError:
 1681         return default
 1682 
 1683 
 1684 def str_or_none(v, default=None):
 1685     return default if v is None else compat_str(v)
 1686 
 1687 
 1688 def str_to_int(int_str):
 1689     """ A more relaxed version of int_or_none """
 1690     if int_str is None:
 1691         return None
 1692     int_str = re.sub(r'[,\.\+]', '', int_str)
 1693     return int(int_str)
 1694 
 1695 
 1696 def float_or_none(v, scale=1, invscale=1, default=None):
 1697     if v is None:
 1698         return default
 1699     try:
 1700         return float(v) * invscale / scale
 1701     except ValueError:
 1702         return default
 1703 
 1704 
 1705 def strip_or_none(v):
 1706     return None if v is None else v.strip()
 1707 
 1708 
def parse_duration(s):
    """Parse a duration string into a float number of seconds.

    Accepts colon-separated timestamps ('[[[DD:]HH:]MM:]SS[.ms]'),
    unit-suffixed forms such as '1d 2h 3m 4.5s' (which also covers
    ISO-8601-like 'PT1H2M3S'), and plain '2.5 hours' / '10 min' phrases.
    Returns None when the string cannot be interpreted.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # First attempt: colon-separated timestamp.
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Second attempt: unit-suffixed components (d/h/m/s words allowed).
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Last attempt: fractional 'hours' or 'minutes' phrases only.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    # Sum the captured components. 'ms' still includes its leading dot
    # (e.g. '.5'), so float(ms) directly yields the fractional seconds.
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
 1755 
 1756 
 1757 def prepend_extension(filename, ext, expected_real_ext=None):
 1758     name, real_ext = os.path.splitext(filename)
 1759     return (
 1760         '{0}.{1}{2}'.format(name, ext, real_ext)
 1761         if not expected_real_ext or real_ext[1:] == expected_real_ext
 1762         else '{0}.{1}'.format(filename, ext))
 1763 
 1764 
 1765 def replace_extension(filename, ext, expected_real_ext=None):
 1766     name, real_ext = os.path.splitext(filename)
 1767     return '{0}.{1}'.format(
 1768         name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
 1769         ext)
 1770 
 1771 
 1772 def check_executable(exe, args=[]):
 1773     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
 1774     args can be a list of arguments for a short output (like -version) """
 1775     try:
 1776         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
 1777     except OSError:
 1778         return False
 1779     return exe
 1780 
 1781 
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # stderr is folded into stdout: some tools print their version there.
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        # Executable missing or not startable.
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
 1795 
 1796 
 1797 def detect_exe_version(output, version_re=None, unrecognized='present'):
 1798     assert isinstance(output, compat_str)
 1799     if version_re is None:
 1800         version_re = r'version\s+([-0-9._a-zA-Z]+)'
 1801     m = re.search(version_re, output)
 1802     if m:
 1803         return m.group(1)
 1804     else:
 1805         return unrecognized
 1806 
 1807 
class PagedList(object):
    """Base class for lazily paged lists; subclasses implement getslice()."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
 1812 
 1813 
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum).

    pagefunc must return an iterable with at most *pagesize* entries; a
    short page is treated as the last one. With use_cache, fetched pages
    are memoized by page number.
    """
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries in the half-open range [start, end)."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip whole pages that lie entirely before the requested start.
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offsets of the requested range within the current page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
 1864 
 1865 
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front.

    pagefunc(pagenum) must return the entries of page *pagenum*; there are
    *pagecount* pages of *pagesize* entries each (the last may be short).
    """
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in the half-open range [start, end)."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries to drop from the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Total entries still wanted (None = everything).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
 1893 
 1894 
 1895 def uppercase_escape(s):
 1896     unicode_escape = codecs.getdecoder('unicode_escape')
 1897     return re.sub(
 1898         r'\\U[0-9a-fA-F]{8}',
 1899         lambda m: unicode_escape(m.group(0))[0],
 1900         s)
 1901 
 1902 
 1903 def lowercase_escape(s):
 1904     unicode_escape = codecs.getdecoder('unicode_escape')
 1905     return re.sub(
 1906         r'\\u[0-9a-fA-F]{4}',
 1907         lambda m: unicode_escape(m.group(0))[0],
 1908         s)
 1909 
 1910 
 1911 def escape_rfc3986(s):
 1912     """Escape non-ASCII characters as suggested by RFC 3986"""
 1913     if sys.version_info < (3, 0) and isinstance(s, compat_str):
 1914         s = s.encode('utf-8')
 1915     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
 1916 
 1917 
 1918 def escape_url(url):
 1919     """Escape URL as suggested by RFC 3986"""
 1920     url_parsed = compat_urllib_parse_urlparse(url)
 1921     return url_parsed._replace(
 1922         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
 1923         path=escape_rfc3986(url_parsed.path),
 1924         params=escape_rfc3986(url_parsed.params),
 1925         query=escape_rfc3986(url_parsed.query),
 1926         fragment=escape_rfc3986(url_parsed.fragment)
 1927     ).geturl()
 1928 
 1929 
 1930 def read_batch_urls(batch_fd):
 1931     def fixup(url):
 1932         if not isinstance(url, compat_str):
 1933             url = url.decode('utf-8', 'replace')
 1934         BOM_UTF8 = '\xef\xbb\xbf'
 1935         if url.startswith(BOM_UTF8):
 1936             url = url[len(BOM_UTF8):]
 1937         url = url.strip()
 1938         if url.startswith(('#', ';', ']')):
 1939             return False
 1940         return url
 1941 
 1942     with contextlib.closing(batch_fd) as fd:
 1943         return [url for url in map(fixup, fd) if url]
 1944 
 1945 
 1946 def urlencode_postdata(*args, **kargs):
 1947     return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
 1948 
 1949 
 1950 def update_url_query(url, query):
 1951     if not query:
 1952         return url
 1953     parsed_url = compat_urlparse.urlparse(url)
 1954     qs = compat_parse_qs(parsed_url.query)
 1955     qs.update(query)
 1956     return compat_urlparse.urlunparse(parsed_url._replace(
 1957         query=compat_urllib_parse_urlencode(qs, True)))
 1958 
 1959 
 1960 def update_Request(req, url=None, data=None, headers={}, query={}):
 1961     req_headers = req.headers.copy()
 1962     req_headers.update(headers)
 1963     req_data = data or req.data
 1964     req_url = update_url_query(url or req.get_full_url(), query)
 1965     req_get_method = req.get_method()
 1966     if req_get_method == 'HEAD':
 1967         req_type = HEADRequest
 1968     elif req_get_method == 'PUT':
 1969         req_type = PUTRequest
 1970     else:
 1971         req_type = compat_urllib_request.Request
 1972     new_req = req_type(
 1973         req_url, data=req_data, headers=req_headers,
 1974         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 1975     if hasattr(req, 'timeout'):
 1976         new_req.timeout = req.timeout
 1977     return new_req
 1978 
 1979 
 1980 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
 1981     if isinstance(key_or_keys, (list, tuple)):
 1982         for key in key_or_keys:
 1983             if key not in d or d[key] is None or skip_false_values and not d[key]:
 1984                 continue
 1985             return d[key]
 1986         return default
 1987     return d.get(key_or_keys, default)
 1988 
 1989 
 1990 def try_get(src, getter, expected_type=None):
 1991     try:
 1992         v = getter(src)
 1993     except (AttributeError, KeyError, TypeError, IndexError):
 1994         pass
 1995     else:
 1996         if expected_type is None or isinstance(v, expected_type):
 1997             return v
 1998 
 1999 
# Coerce *string* to text (compat_str); bytes are decoded using *encoding*.
# NOTE: the default encoding is computed once, at module import time.
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
 2002 
 2003 
# US MPAA content ratings mapped to numeric age limits.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
 2011 
 2012 
# US TV Parental Guidelines ratings mapped to numeric age limits.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
 2021 
 2022 
def parse_age_limit(s):
    # Normalize an age limit given as an int, an 'NN+' style string, a US
    # MPAA rating or a US TV parental guideline into an int age (or None).
    if type(s) == int:
        # type() rather than isinstance() also rejects bools here.
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
 2034 
 2035 
 2036 def strip_jsonp(code):
 2037     return re.sub(
 2038         r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
 2039 
 2040 
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text.

    Handles single-quoted strings, bare identifiers used as keys,
    hex/octal integers (also as keys), comments and trailing commas.
    """
    def fix_kv(m):
        v = m.group(0)
        # JSON keywords pass through untouched.
        if v in ('true', 'false', 'null'):
            return v
        # Block comments and trailing commas are dropped.
        elif v.startswith('/*') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Re-escape a string body: keep escaped single quotes and
            # line continuations sane, rewrite \xNN escapes as \u00NN.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex/octal integers, optionally followed by ':' (object key).
        INTEGER_TABLE = (
            (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
            (r'^(0+[0-7]+)\s*:?$', 8),
        )

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Keys become quoted decimal strings; values stay numbers.
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Anything else is a bare identifier: quote it.
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
 2078 
 2079 
 2080 def qualities(quality_ids):
 2081     """ Get a numeric quality value out of a list of possible values """
 2082     def q(qid):
 2083         try:
 2084             return quality_ids.index(qid)
 2085         except ValueError:
 2086             return -1
 2087     return q
 2088 
 2089 
 2090 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
 2091 
 2092 
 2093 def limit_length(s, length):
 2094     """ Add ellipses to overly long strings """
 2095     if s is None:
 2096         return None
 2097     ELLIPSES = '...'
 2098     if len(s) > length:
 2099         return s[:length - len(ELLIPSES)] + ELLIPSES
 2100     return s
 2101 
 2102 
 2103 def version_tuple(v):
 2104     return tuple(int(e) for e in re.split(r'[-.]', v))
 2105 
 2106 
 2107 def is_outdated_version(version, limit, assume_new=True):
 2108     if not version:
 2109         return not assume_new
 2110     try:
 2111         return version_tuple(version) < version_tuple(limit)
 2112     except ValueError:
 2113         return not assume_new
 2114 
 2115 
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Self-update only works when running from the bundled zip
    # (__loader__ is a zipimporter) or from a frozen build (sys.frozen).
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
 2121 
 2122 
 2123 def args_to_str(args):
 2124     # Get a short string representation for a subprocess command
 2125     return ' '.join(compat_shlex_quote(a) for a in args)
 2126 
 2127 
 2128 def error_to_compat_str(err):
 2129     err_str = str(err)
 2130     # On python 2 error byte string must be decoded with proper
 2131     # encoding rather than ascii
 2132     if sys.version_info[0] < 3:
 2133         err_str = err_str.decode(preferredencoding())
 2134     return err_str
 2135 
 2136 
 2137 def mimetype2ext(mt):
 2138     if mt is None:
 2139         return None
 2140 
 2141     ext = {
 2142         'audio/mp4': 'm4a',
 2143         # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
 2144         # it's the most popular one
 2145         'audio/mpeg': 'mp3',
 2146     }.get(mt)
 2147     if ext is not None:
 2148         return ext
 2149 
 2150     _, _, res = mt.rpartition('/')
 2151     res = res.split(';')[0].strip().lower()
 2152 
 2153     return {
 2154         '3gpp': '3gp',
 2155         'smptett+xml': 'tt',
 2156         'srt': 'srt',
 2157         'ttaf+xml': 'dfxp',
 2158         'ttml+xml': 'ttml',
 2159         'vtt': 'vtt',
 2160         'x-flv': 'flv',
 2161         'x-mp4-fragmented': 'mp4',
 2162         'x-ms-wmv': 'wmv',
 2163         'mpegurl': 'm3u8',
 2164         'x-mpegurl': 'm3u8',
 2165         'vnd.apple.mpegurl': 'm3u8',
 2166         'dash+xml': 'mpd',
 2167         'f4m': 'f4m',
 2168         'f4m+xml': 'f4m',
 2169         'hds+xml': 'f4m',
 2170         'vnd.ms-sstr+xml': 'ism',
 2171     }.get(res, res)
 2172 
 2173 
 2174 def parse_codecs(codecs_str):
 2175     # http://tools.ietf.org/html/rfc6381
 2176     if not codecs_str:
 2177         return {}
 2178     splited_codecs = list(filter(None, map(
 2179         lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
 2180     vcodec, acodec = None, None
 2181     for full_codec in splited_codecs:
 2182         codec = full_codec.split('.')[0]
 2183         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
 2184             if not vcodec:
 2185                 vcodec = full_codec
 2186         elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
 2187             if not acodec:
 2188                 acodec = full_codec
 2189         else:
 2190             write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
 2191     if not vcodec and not acodec:
 2192         if len(splited_codecs) == 2:
 2193             return {
 2194                 'vcodec': vcodec,
 2195                 'acodec': acodec,
 2196             }
 2197         elif len(splited_codecs) == 1:
 2198             return {
 2199                 'vcodec': 'none',
 2200                 'acodec': vcodec,
 2201             }
 2202     else:
 2203         return {
 2204             'vcodec': vcodec or 'none',
 2205             'acodec': acodec or 'none',
 2206         }
 2207     return {}
 2208 
 2209 
 2210 def urlhandle_detect_ext(url_handle):
 2211     getheader = url_handle.headers.get
 2212 
 2213     cd = getheader('Content-Disposition')
 2214     if cd:
 2215         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
 2216         if m:
 2217             e = determine_ext(m.group('filename'), default_ext=None)
 2218             if e:
 2219                 return e
 2220 
 2221     return mimetype2ext(getheader('Content-Type'))
 2222 
 2223 
 2224 def encode_data_uri(data, mime_type):
 2225     return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
 2226 
 2227 
 2228 def age_restricted(content_limit, age_limit):
 2229     """ Returns True iff the content should be blocked """
 2230 
 2231     if age_limit is None:  # No limit set
 2232         return False
 2233     if content_limit is None:
 2234         return False  # Content available for everyone
 2235     return age_limit < content_limit
 2236 
 2237 
 2238 def is_html(first_bytes):
 2239     """ Detect whether a file contains HTML by examining its first bytes. """
 2240 
 2241     BOMS = [
 2242         (b'\xef\xbb\xbf', 'utf-8'),
 2243         (b'\x00\x00\xfe\xff', 'utf-32-be'),
 2244         (b'\xff\xfe\x00\x00', 'utf-32-le'),
 2245         (b'\xff\xfe', 'utf-16-le'),
 2246         (b'\xfe\xff', 'utf-16-be'),
 2247     ]
 2248     for bom, enc in BOMS:
 2249         if first_bytes.startswith(bom):
 2250             s = first_bytes[len(bom):].decode(enc, 'replace')
 2251             break
 2252     else:
 2253         s = first_bytes.decode('utf-8', 'replace')
 2254 
 2255     return re.match(r'^\s*<', s)
 2256 
 2257 
 2258 def determine_protocol(info_dict):
 2259     protocol = info_dict.get('protocol')
 2260     if protocol is not None:
 2261         return protocol
 2262 
 2263     url = info_dict['url']
 2264     if url.startswith('rtmp'):
 2265         return 'rtmp'
 2266     elif url.startswith('mms'):
 2267         return 'mms'
 2268     elif url.startswith('rtsp'):
 2269         return 'rtsp'
 2270 
 2271     ext = determine_ext(url)
 2272     if ext == 'm3u8':
 2273         return 'm3u8'
 2274     elif ext == 'f4m':
 2275         return 'f4m'
 2276 
 2277     return compat_urllib_parse_urlparse(url).scheme
 2278 
 2279 
 2280 def render_table(header_row, data):
 2281     """ Render a list of rows, each as a list of values """
 2282     table = [header_row] + data
 2283     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
 2284     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
 2285     return '\n'.join(format_str % tuple(row) for row in table)
 2286 
 2287 
def _match_one(filter_part, dct):
    """Evaluate a single --match-filter clause against the dict *dct*.

    Supports binary comparisons ('key <op> value', where a trailing '?'
    after the operator makes a missing key pass) and unary presence tests
    ('key' / '!key'). Raises ValueError on malformed clauses.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Strings only support equality tests.
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try a filesize suffix ('500k',
                # '1.2MiB'), then once more with an implicit trailing 'B'.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing key: matches only when the '?' marker was given.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
 2345 
 2346 
 2347 def match_str(filter_str, dct):
 2348     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
 2349 
 2350     return all(
 2351         _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
 2352 
 2353 
 2354 def match_filter_func(filter_str):
 2355     def _match_func(info_dict):
 2356         if match_str(filter_str, info_dict):
 2357             return None
 2358         else:
 2359             video_title = info_dict.get('title', info_dict.get('id', 'video'))
 2360             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
 2361     return _match_func
 2362 
 2363 
 2364 def parse_dfxp_time_expr(time_expr):
 2365     if not time_expr:
 2366         return
 2367 
 2368     mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
 2369     if mobj:
 2370         return float(mobj.group('time_offset'))
 2371 
 2372     mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
 2373     if mobj:
 2374         return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
 2375 
 2376 
 2377 def srt_subtitles_timecode(seconds):
 2378     return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
 2379 
 2380 
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a text string) into SRT text.

    Raises ValueError when the document contains no <p> cues.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Accumulates the text of one <p> cue, turning <br> into newlines.
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Extract the plain-text content of a subtitle paragraph node.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try each known TTML namespace before falling back to no namespace.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # Cues without a start time cannot be rendered.
            continue
        if not end_time:
            # Derive the end time from the duration when 'end' is absent.
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
 2434 
 2435 
 2436 def cli_option(params, command_option, param):
 2437     param = params.get(param)
 2438     if param:
 2439         param = compat_str(param)
 2440     return [command_option, param] if param is not None else []
 2441 
 2442 
 2443 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
 2444     param = params.get(param)
 2445     assert isinstance(param, bool)
 2446     if separator:
 2447         return [command_option + separator + (true_value if param else false_value)]
 2448     return [command_option, true_value if param else false_value]
 2449 
 2450 
 2451 def cli_valueless_option(params, command_option, param, expected_value=True):
 2452     param = params.get(param)
 2453     return [command_option] if param == expected_value else []
 2454 
 2455 
 2456 def cli_configuration_args(params, param, default=[]):
 2457     ex_args = params.get(param)
 2458     if ex_args is None:
 2459         return default
 2460     assert isinstance(ex_args, list)
 2461     return ex_args
 2462 
 2463 
class ISO639Utils(object):
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (e.g. 'en-US' -> 'eng').
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None implicitly when unknown.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
 2664 
 2665 
class ISO3166Utils(object):
    """Lookup table from ISO 3166-1 alpha-2 country codes to full English
    country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding
        full country name (or None if unknown); the lookup is
        case-insensitive."""
        return cls._country_map.get(code.upper())
 2924 
 2925 
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets an individual request override the proxy via a
    'Ytdl-request-proxy' header, and forwards SOCKS proxy URLs to the
    HTTP(S) handlers via a 'Ytdl-socks-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Default arguments freeze the current `type` and the
                    # bound proxy_open method at definition time, avoiding
                    # the late-binding-closure pitfall inside the loop.
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy set in the request headers takes precedence
        # over the handler-level default; the marker header is consumed here
        # so it is never sent over the wire.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
 2949 
 2950 
 2951 def ohdave_rsa_encrypt(data, exponent, modulus):
 2952     '''
 2953     Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
 2954 
 2955     Input:
 2956         data: data to encrypt, bytes-like object
 2957         exponent, modulus: parameter e and N of RSA algorithm, both integer
 2958     Output: hex string of encrypted data
 2959 
 2960     Limitation: supports one block encryption only
 2961     '''
 2962 
 2963     payload = int(binascii.hexlify(data[::-1]), 16)
 2964     encrypted = pow(payload, exponent, modulus)
 2965     return '%x' % encrypted
 2966 
 2967 
 2968 def encode_base_n(num, n, table=None):
 2969     FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
 2970     if not table:
 2971         table = FULL_TABLE[:n]
 2972 
 2973     if n > len(table):
 2974         raise ValueError('base %d exceeds table length %d' % (n, len(table)))
 2975 
 2976     if num == 0:
 2977         return table[0]
 2978 
 2979     ret = ''
 2980     while num:
 2981         ret = table[num % n] + ret
 2982         num = num // n
 2983     return ret
 2984 
 2985 
 2986 def decode_packed_codes(code):
 2987     mobj = re.search(
 2988         r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
 2989         code)
 2990     obfucasted_code, base, count, symbols = mobj.groups()
 2991     base = int(base)
 2992     count = int(count)
 2993     symbols = symbols.split('|')
 2994     symbol_table = {}
 2995 
 2996     while count:
 2997         count -= 1
 2998         base_n_count = encode_base_n(count, base)
 2999         symbol_table[base_n_count] = symbols[count] or base_n_count
 3000 
 3001     return re.sub(
 3002         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
 3003         obfucasted_code)
 3004 
 3005 
 3006 def parse_m3u8_attributes(attrib):
 3007     info = {}
 3008     for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
 3009         if val.startswith('"'):
 3010             val = val[1:-1]
 3011         info[key] = val
 3012     return info
 3013 
 3014 
 3015 def urshift(val, n):
 3016     return val >> n if val >= 0 else (val + 0x100000000) >> n
 3017 
 3018 
 3019 # Based on png2str() written by @gdkchan and improved by @yokrysty
 3020 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into (width, height, pixels).

    pixels is a list of `height` rows, each a flat list of `stride`
    (= width * 3) byte values with the PNG scanline filters reversed.

    NOTE(review): stride = width * 3 assumes non-interlaced 8-bit
    truecolor (RGB) input; other color types / bit depths are not
    handled — confirm against the callers' source images.

    Raises IOError for non-PNG input or when no IDAT chunk is present.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the 8-byte PNG signature; IHDR must be the first chunk.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned integer unpackers keyed by byte length.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Chunk layout: 4-byte length, 4-byte type, payload, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is chunks[0], guaranteed by the signature check above.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may span several IDAT chunks; concatenate them before
    # inflating.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Bytes per scanline (3 samples per pixel).
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # idx is a flat sample index into the already-decoded rows.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by a single filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Filter predictors: `left` is the same sample of the previous
            # pixel (3 bytes back), `up` the same sample of the pixel above;
            # out-of-bounds neighbors are treated as 0 per the PNG spec.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the upper-left neighbor, available only when both
                # a left pixel and a previous row exist.
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: use whichever of a/b/c is closest to p.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels

Generated by cgit