youtube_dl/utils.py



    1 #!/usr/bin/env python
    2 # -*- coding: utf-8 -*-
    3 
    4 import calendar
    5 import codecs
    6 import contextlib
    7 import ctypes
    8 import datetime
    9 import email.utils
   10 import errno
   11 import getpass
   12 import gzip
   13 import itertools
   14 import io
   15 import json
   16 import locale
   17 import math
   18 import os
   19 import pipes
   20 import platform
   21 import re
   22 import ssl
   23 import socket
   24 import struct
   25 import subprocess
   26 import sys
   27 import traceback
   28 import xml.etree.ElementTree
   29 import zlib
   30 
   31 try:
   32     import urllib.request as compat_urllib_request
   33 except ImportError: # Python 2
   34     import urllib2 as compat_urllib_request
   35 
   36 try:
   37     import urllib.error as compat_urllib_error
   38 except ImportError: # Python 2
   39     import urllib2 as compat_urllib_error
   40 
   41 try:
   42     import urllib.parse as compat_urllib_parse
   43 except ImportError: # Python 2
   44     import urllib as compat_urllib_parse
   45 
   46 try:
   47     from urllib.parse import urlparse as compat_urllib_parse_urlparse
   48 except ImportError: # Python 2
   49     from urlparse import urlparse as compat_urllib_parse_urlparse
   50 
   51 try:
   52     import urllib.parse as compat_urlparse
   53 except ImportError: # Python 2
   54     import urlparse as compat_urlparse
   55 
   56 try:
   57     import http.cookiejar as compat_cookiejar
   58 except ImportError: # Python 2
   59     import cookielib as compat_cookiejar
   60 
   61 try:
   62     import html.entities as compat_html_entities
   63 except ImportError: # Python 2
   64     import htmlentitydefs as compat_html_entities
   65 
   66 try:
   67     import html.parser as compat_html_parser
   68 except ImportError: # Python 2
   69     import HTMLParser as compat_html_parser
   70 
   71 try:
   72     import http.client as compat_http_client
   73 except ImportError: # Python 2
   74     import httplib as compat_http_client
   75 
   76 try:
   77     from urllib.error import HTTPError as compat_HTTPError
   78 except ImportError:  # Python 2
   79     from urllib2 import HTTPError as compat_HTTPError
   80 
   81 try:
   82     from urllib.request import urlretrieve as compat_urlretrieve
   83 except ImportError:  # Python 2
   84     from urllib import urlretrieve as compat_urlretrieve
   85 
   86 
   87 try:
   88     from subprocess import DEVNULL
   89     compat_subprocess_get_DEVNULL = lambda: DEVNULL
   90 except ImportError:
   91     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
   92 
   93 try:
   94     from urllib.parse import unquote as compat_urllib_parse_unquote
   95 except ImportError:
   96     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
   97         if string == '':
   98             return string
   99         res = string.split('%')
  100         if len(res) == 1:
  101             return string
  102         if encoding is None:
  103             encoding = 'utf-8'
  104         if errors is None:
  105             errors = 'replace'
  106         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  107         pct_sequence = b''
  108         string = res[0]
  109         for item in res[1:]:
  110             try:
  111                 if not item:
  112                     raise ValueError
  113                 pct_sequence += item[:2].decode('hex')
  114                 rest = item[2:]
  115                 if not rest:
  116                     # This segment was just a single percent-encoded character.
  117                     # May be part of a sequence of code units, so delay decoding.
  118                     # (Stored in pct_sequence).
  119                     continue
  120             except ValueError:
  121                 rest = '%' + item
  122             # Encountered non-percent-encoded characters. Flush the current
  123             # pct_sequence.
  124             string += pct_sequence.decode(encoding, errors) + rest
  125             pct_sequence = b''
  126         if pct_sequence:
  127             # Flush the final pct_sequence
  128             string += pct_sequence.decode(encoding, errors)
  129         return string
  130 
  131 
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        qs                -- the query string; fields are separated by '&'
                             or ';'
        keep_blank_values -- when true, fields with an empty value (or no
                             '=' at all) are kept with '' as value
        strict_parsing    -- when true, a field without '=' raises
                             ValueError instead of being skipped
        encoding, errors  -- passed to compat_urllib_parse_unquote

        Names and values have '+' turned into a space, are percent-decoded
        and converted with unicode().
        """
        qs, _coerce_result = qs, unicode
        # Split on both separators: first '&', then ';' inside each piece
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Takes the same arguments as _parse_qsl. Values of a repeated name
        are collected in the order in which they appear in qs.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
  178 
# Text string type: the ``unicode`` builtin where it exists, ``str`` otherwise.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Function turning a code point into a one-character text string:
# ``unichr`` where it exists, ``chr`` otherwise.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception class to catch for XML parse failures: ElementTree's ParseError
# when it can be imported, expat's ExpatError otherwise.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
  193 
  194 def compat_ord(c):
  195     if type(c) is int: return c
  196     else: return ord(c)
  197 
# This is not clearly defined otherwise
# (the type of a compiled pattern, obtained by compiling an empty one)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers; YoutubeDLHandler.http_request adds every
# entry of this dict to each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  208 
  209 def preferredencoding():
  210     """Get preferred encoding.
  211 
  212     Returns the best encoding scheme for the system, based on
  213     locale.getpreferredencoding() and some further tweaks.
  214     """
  215     try:
  216         pref = locale.getpreferredencoding()
  217         u'TEST'.encode(pref)
  218     except:
  219         pref = 'UTF-8'
  220 
  221     return pref
  222 
  223 if sys.version_info < (3,0):
  224     def compat_print(s):
  225         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
  226 else:
  227     def compat_print(s):
  228         assert type(s) == type(u'')
  229         print(s)
  230 
  231 # In Python 2.x, json.dump expects a bytestream.
  232 # In Python 3.x, it writes to a character stream
  233 if sys.version_info < (3,0):
  234     def write_json_file(obj, fn):
  235         with open(fn, 'wb') as f:
  236             json.dump(obj, f)
  237 else:
  238     def write_json_file(obj, fn):
  239         with open(fn, 'w', encoding='utf-8') as f:
  240             json.dump(obj, f)
  241 
  242 if sys.version_info >= (2,7):
  243     def find_xpath_attr(node, xpath, key, val):
  244         """ Find the xpath xpath[@key=val] """
  245         assert re.match(r'^[a-zA-Z-]+$', key)
  246         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
  247         expr = xpath + u"[@%s='%s']" % (key, val)
  248         return node.find(expr)
  249 else:
  250     def find_xpath_attr(node, xpath, key, val):
  251         for f in node.findall(xpath):
  252             if f.attrib.get(key) == val:
  253                 return f
  254         return None
  255 
  256 # On python2.6 the xml.etree.ElementTree.Element methods don't support
  257 # the namespace parameter
  258 def xpath_with_ns(path, ns_map):
  259     components = [c.split(':') for c in path.split('/')]
  260     replaced = []
  261     for c in components:
  262         if len(c) == 1:
  263             replaced.append(c[0])
  264         else:
  265             ns, tag = c
  266             replaced.append('{%s}%s' % (ns_map[ns], tag))
  267     return '/'.join(replaced)
  268 
  269 def htmlentity_transform(matchobj):
  270     """Transforms an HTML entity to a character.
  271 
  272     This function receives a match object and is intended to be used with
  273     the re.sub() function.
  274     """
  275     entity = matchobj.group(1)
  276 
  277     # Known non-numeric HTML entity
  278     if entity in compat_html_entities.name2codepoint:
  279         return compat_chr(compat_html_entities.name2codepoint[entity])
  280 
  281     mobj = re.match(u'(?u)#(x?\\d+)', entity)
  282     if mobj is not None:
  283         numstr = mobj.group(1)
  284         if numstr.startswith(u'x'):
  285             base = 16
  286             numstr = u'0%s' % numstr
  287         else:
  288             base = 10
  289         return compat_chr(int(numstr, base))
  290 
  291     # Unknown entity in name, return its literal representation
  292     return (u'&%s;' % entity)
  293 
# Overwrite the ``locatestarttagend`` pattern of the HTML parser module with
# a corrected expression. The assignment targets the module attribute, so
# it is not limited to the parser classes defined in this file.
# NOTE(review): presumably this is the pattern HTMLParser uses to find the
# end of a start tag -- confirm against the stdlib version in use.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  295 class BaseHTMLParser(compat_html_parser.HTMLParser):
  296     def __init(self):
  297         compat_html_parser.HTMLParser.__init__(self)
  298         self.html = None
  299 
  300     def loads(self, html):
  301         self.html = html
  302         self.feed(html)
  303         self.close()
  304 
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows from None to
    [tag, start position, end position]; get_result() then cuts the text
    between both positions out of the parsed document.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # None until a matching start tag is seen, then a list (see above)
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Tag name -> number of currently open tags, counted while started
        self.depth = {}
        # True from the matched start tag until its start position is stored
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error reported by the parser.

        Before the wanted element has been reached, an error is recovered
        from by dropping the input up to the current line and parsing on.
        Inside the element, or once error_count exceeds 10, HTMLParseError
        is raised.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag may be the first thing after the matched tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once every tag of the matched name
            # opened since the match has been closed again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever comes first after the matched start tag marks the start
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions.

        Returns None unless a tag, a start position and an end position
        have all been recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are used as (line, column); the line is 1-based here
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column has to be
            # shifted by what was just cut off at the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, AttrParser skips over the literal text
# </scr'+'ipt> itself and hands every other end tag to the stock
# HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
  370 
  371 def get_element_by_id(id, html):
  372     """Return the content of the tag with the specified ID in the passed HTML document"""
  373     return get_element_by_attribute("id", id, html)
  374 
  375 def get_element_by_attribute(attribute, value, html):
  376     """Return the content of the tag with the specified attribute in the passed HTML document"""
  377     parser = AttrParser(attribute, value)
  378     try:
  379         parser.loads(html)
  380     except compat_html_parser.HTMLParseError:
  381         pass
  382     return parser.get_result()
  383 
  384 class MetaParser(BaseHTMLParser):
  385     """
  386     Modified HTMLParser that isolates a meta tag with the specified name 
  387     attribute.
  388     """
  389     def __init__(self, name):
  390         BaseHTMLParser.__init__(self)
  391         self.name = name
  392         self.content = None
  393         self.result = None
  394 
  395     def handle_starttag(self, tag, attrs):
  396         if tag != 'meta':
  397             return
  398         attrs = dict(attrs)
  399         if attrs.get('name') == self.name:
  400             self.result = attrs.get('content')
  401 
  402     def get_result(self):
  403         return self.result
  404 
  405 def get_meta_content(name, html):
  406     """
  407     Return the content attribute from the meta tag with the given name attribute.
  408     """
  409     parser = MetaParser(name)
  410     try:
  411         parser.loads(html)
  412     except compat_html_parser.HTMLParseError:
  413         pass
  414     return parser.get_result()
  415 
  416 
  417 def clean_html(html):
  418     """Clean an HTML snippet into a readable string"""
  419     # Newline vs <br />
  420     html = html.replace('\n', ' ')
  421     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
  422     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
  423     # Strip html tags
  424     html = re.sub('<.*?>', '', html)
  425     # Replace html entities
  426     html = unescapeHTML(html)
  427     return html.strip()
  428 
  429 
  430 def sanitize_open(filename, open_mode):
  431     """Try to open the given filename, and slightly tweak it if this fails.
  432 
  433     Attempts to open the given filename. If this fails, it tries to change
  434     the filename slightly, step by step, until it's either able to open it
  435     or it fails and raises a final exception, like the standard open()
  436     function.
  437 
  438     It returns the tuple (stream, definitive_file_name).
  439     """
  440     try:
  441         if filename == u'-':
  442             if sys.platform == 'win32':
  443                 import msvcrt
  444                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
  445             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
  446         stream = open(encodeFilename(filename), open_mode)
  447         return (stream, filename)
  448     except (IOError, OSError) as err:
  449         if err.errno in (errno.EACCES,):
  450             raise
  451 
  452         # In case of error, try to remove win32 forbidden chars
  453         alt_filename = os.path.join(
  454                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
  455                         for path_part in os.path.split(filename)
  456                        )
  457         if alt_filename == filename:
  458             raise
  459         else:
  460             # An exception here should be caught in the caller
  461             stream = open(encodeFilename(filename), open_mode)
  462             return (stream, alt_filename)
  463 
  464 
  465 def timeconvert(timestr):
  466     """Convert RFC 2822 defined time string into system timestamp"""
  467     timestamp = None
  468     timetuple = email.utils.parsedate_tz(timestr)
  469     if timetuple is not None:
  470         timestamp = email.utils.mktime_tz(timetuple)
  471     return timestamp
  472 
  473 def sanitize_filename(s, restricted=False, is_id=False):
  474     """Sanitizes a string so it could be used as part of a filename.
  475     If restricted is set, use a stricter subset of allowed characters.
  476     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
  477     """
  478     def replace_insane(char):
  479         if char == '?' or ord(char) < 32 or ord(char) == 127:
  480             return ''
  481         elif char == '"':
  482             return '' if restricted else '\''
  483         elif char == ':':
  484             return '_-' if restricted else ' -'
  485         elif char in '\\/|*<>':
  486             return '_'
  487         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
  488             return '_'
  489         if restricted and ord(char) > 127:
  490             return '_'
  491         return char
  492 
  493     result = u''.join(map(replace_insane, s))
  494     if not is_id:
  495         while '__' in result:
  496             result = result.replace('__', '_')
  497         result = result.strip('_')
  498         # Common case of "Foreign band name - English song title"
  499         if restricted and result.startswith('-_'):
  500             result = result[2:]
  501         if not result:
  502             result = '_'
  503     return result
  504 
  505 def orderedSet(iterable):
  506     """ Remove all duplicates from the input iterable """
  507     res = []
  508     for el in iterable:
  509         if el not in res:
  510             res.append(el)
  511     return res
  512 
  513 
  514 def unescapeHTML(s):
  515     if s is None:
  516         return None
  517     assert type(s) == compat_str
  518 
  519     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
  520     return result
  521 
  522 
  523 def encodeFilename(s, for_subprocess=False):
  524     """
  525     @param s The name of the file
  526     """
  527 
  528     assert type(s) == compat_str
  529 
  530     # Python 3 has a Unicode API
  531     if sys.version_info >= (3, 0):
  532         return s
  533 
  534     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
  535         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
  536         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
  537         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
  538         if not for_subprocess:
  539             return s
  540         else:
  541             # For subprocess calls, encode with locale encoding
  542             # Refer to http://stackoverflow.com/a/9951851/35070
  543             encoding = preferredencoding()
  544     else:
  545         encoding = sys.getfilesystemencoding()
  546     if encoding is None:
  547         encoding = 'utf-8'
  548     return s.encode(encoding, 'ignore')
  549 
  550 
  551 def encodeArgument(s):
  552     if not isinstance(s, compat_str):
  553         # Legacy code that uses byte strings
  554         # Uncomment the following line after fixing all post processors
  555         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
  556         s = s.decode('ascii')
  557     return encodeFilename(s, True)
  558 
  559 
  560 def decodeOption(optval):
  561     if optval is None:
  562         return optval
  563     if isinstance(optval, bytes):
  564         optval = optval.decode(preferredencoding())
  565 
  566     assert isinstance(optval, compat_str)
  567     return optval
  568 
  569 def formatSeconds(secs):
  570     if secs > 3600:
  571         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
  572     elif secs > 60:
  573         return '%d:%02d' % (secs // 60, secs % 60)
  574     else:
  575         return '%d' % secs
  576 
  577 
  578 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
  579     if sys.version_info < (3, 2):
  580         import httplib
  581 
  582         class HTTPSConnectionV3(httplib.HTTPSConnection):
  583             def __init__(self, *args, **kwargs):
  584                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
  585 
  586             def connect(self):
  587                 sock = socket.create_connection((self.host, self.port), self.timeout)
  588                 if getattr(self, '_tunnel_host', False):
  589                     self.sock = sock
  590                     self._tunnel()
  591                 try:
  592                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
  593                 except ssl.SSLError:
  594                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
  595 
  596         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
  597             def https_open(self, req):
  598                 return self.do_open(HTTPSConnectionV3, req)
  599         return HTTPSHandlerV3(**kwargs)
  600     else:
  601         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
  602         context.verify_mode = (ssl.CERT_NONE
  603                                if opts_no_check_certificate
  604                                else ssl.CERT_REQUIRED)
  605         context.set_default_verify_paths()
  606         try:
  607             context.load_default_certs()
  608         except AttributeError:
  609             pass  # Python < 3.4
  610         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
  611 
  612 class ExtractorError(Exception):
  613     """Error during info extraction."""
  614     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
  615         """ tb, if given, is the original traceback (so that it can be printed out).
  616         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
  617         """
  618 
  619         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
  620             expected = True
  621         if video_id is not None:
  622             msg = video_id + ': ' + msg
  623         if not expected:
  624             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
  625         super(ExtractorError, self).__init__(msg)
  626 
  627         self.traceback = tb
  628         self.exc_info = sys.exc_info()  # preserve original exception
  629         self.cause = cause
  630         self.video_id = video_id
  631 
  632     def format_traceback(self):
  633         if self.traceback is None:
  634             return None
  635         return u''.join(traceback.format_tb(self.traceback))
  636 
  637 
  638 class RegexNotFoundError(ExtractorError):
  639     """Error when a regex didn't match"""
  640     pass
  641 
  642 
  643 class DownloadError(Exception):
  644     """Download Error exception.
  645 
  646     This exception may be thrown by FileDownloader objects if they are not
  647     configured to continue on errors. They will contain the appropriate
  648     error message.
  649     """
  650     def __init__(self, msg, exc_info=None):
  651         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
  652         super(DownloadError, self).__init__(msg)
  653         self.exc_info = exc_info
  654 
  655 
  656 class SameFileError(Exception):
  657     """Same File exception.
  658 
  659     This exception will be thrown by FileDownloader objects if they detect
  660     multiple files would have to be downloaded to the same file on disk.
  661     """
  662     pass
  663 
  664 
  665 class PostProcessingError(Exception):
  666     """Post Processing exception.
  667 
  668     This exception may be raised by PostProcessor's .run() method to
  669     indicate an error in the postprocessing task.
  670     """
  671     def __init__(self, msg):
  672         self.msg = msg
  673 
  674 class MaxDownloadsReached(Exception):
  675     """ --max-downloads limit has been reached. """
  676     pass
  677 
  678 
  679 class UnavailableVideoError(Exception):
  680     """Unavailable Format exception.
  681 
  682     This exception will be thrown when a video is requested
  683     in a format that is not available for that video.
  684     """
  685     pass
  686 
  687 
  688 class ContentTooShortError(Exception):
  689     """Content Too Short exception.
  690 
  691     This exception may be raised by FileDownloader objects when a file they
  692     download is too small for what the server announced first, indicating
  693     the connection was probably interrupted.
  694     """
  695     # Both in bytes
  696     downloaded = None
  697     expected = None
  698 
  699     def __init__(self, downloaded, expected):
  700         self.downloaded = downloaded
  701         self.expected = expected
  702 
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    In the same way, the header "Youtubedl-user-agent" replaces the
    User-agent header of a request and is removed afterwards.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without zlib header."""
        try:
            # Negative window bits: raw stream without header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code.

        If addinfourl has no getcode, it is created without the code and
        the code is set as an attribute afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the Youtubedl-* headers."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # The two marker headers are instructions for this handler and are
        # removed from the request
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body decompressed.

        The replacement response keeps the headers, url, code and msg of
        the original one.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 bytes cut off the end; if nothing
                # helps, report the first error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is treated exactly like HTTP traffic
    https_request = http_request
    https_response = http_response
  783 
  784 
  785 def parse_iso8601(date_str, delimiter='T'):
  786     """ Return a UNIX timestamp from the given date """
  787 
  788     if date_str is None:
  789         return None
  790 
  791     m = re.search(
  792         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
  793         date_str)
  794     if not m:
  795         timezone = datetime.timedelta()
  796     else:
  797         date_str = date_str[:-len(m.group(0))]
  798         if not m.group('sign'):
  799             timezone = datetime.timedelta()
  800         else:
  801             sign = 1 if m.group('sign') == '+' else -1
  802             timezone = datetime.timedelta(
  803                 hours=sign * int(m.group('hours')),
  804                 minutes=sign * int(m.group('minutes')))
  805     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
  806     dt = datetime.datetime.strptime(date_str, date_format) - timezone
  807     return calendar.timegm(dt.timetuple())
  808 
  809 
  810 def unified_strdate(date_str):
  811     """Return a string with the date in the format YYYYMMDD"""
  812 
  813     if date_str is None:
  814         return None
  815 
  816     upload_date = None
  817     #Replace commas
  818     date_str = date_str.replace(',', ' ')
  819     # %z (UTC offset) is only supported in python>=3.2
  820     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
  821     format_expressions = [
  822         '%d %B %Y',
  823         '%d %b %Y',
  824         '%B %d %Y',
  825         '%b %d %Y',
  826         '%b %dst %Y %I:%M%p',
  827         '%b %dnd %Y %I:%M%p',
  828         '%b %dth %Y %I:%M%p',
  829         '%Y-%m-%d',
  830         '%d.%m.%Y',
  831         '%d/%m/%Y',
  832         '%Y/%m/%d %H:%M:%S',
  833         '%Y-%m-%d %H:%M:%S',
  834         '%d.%m.%Y %H:%M',
  835         '%d.%m.%Y %H.%M',
  836         '%Y-%m-%dT%H:%M:%SZ',
  837         '%Y-%m-%dT%H:%M:%S.%fZ',
  838         '%Y-%m-%dT%H:%M:%S.%f0Z',
  839         '%Y-%m-%dT%H:%M:%S',
  840         '%Y-%m-%dT%H:%M:%S.%f',
  841         '%Y-%m-%dT%H:%M',
  842     ]
  843     for expression in format_expressions:
  844         try:
  845             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
  846         except ValueError:
  847             pass
  848     if upload_date is None:
  849         timetuple = email.utils.parsedate_tz(date_str)
  850         if timetuple:
  851             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
  852     return upload_date
  853 
  854 def determine_ext(url, default_ext=u'unknown_video'):
  855     if url is None:
  856         return default_ext
  857     guess = url.partition(u'?')[0].rpartition(u'.')[2]
  858     if re.match(r'^[A-Za-z0-9]+$', guess):
  859         return guess
  860     else:
  861         return default_ext
  862 
  863 def subtitles_filename(filename, sub_lang, sub_format):
  864     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
  865 
  866 def date_from_str(date_str):
  867     """
  868     Return a datetime object from a string in the format YYYYMMDD or
  869     (now|today)[+-][0-9](day|week|month|year)(s)?"""
  870     today = datetime.date.today()
  871     if date_str == 'now'or date_str == 'today':
  872         return today
  873     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
  874     if match is not None:
  875         sign = match.group('sign')
  876         time = int(match.group('time'))
  877         if sign == '-':
  878             time = -time
  879         unit = match.group('unit')
  880         #A bad aproximation?
  881         if unit == 'month':
  882             unit = 'day'
  883             time *= 30
  884         elif unit == 'year':
  885             unit = 'day'
  886             time *= 365
  887         unit += 's'
  888         delta = datetime.timedelta(**{unit: time})
  889         return today + delta
  890     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
  891     
  892 def hyphenate_date(date_str):
  893     """
  894     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
  895     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
  896     if match is not None:
  897         return '-'.join(match.groups())
  898     else:
  899         return date_str
  900 
  901 class DateRange(object):
  902     """Represents a time interval between two dates"""
  903     def __init__(self, start=None, end=None):
  904         """start and end must be strings in the format accepted by date"""
  905         if start is not None:
  906             self.start = date_from_str(start)
  907         else:
  908             self.start = datetime.datetime.min.date()
  909         if end is not None:
  910             self.end = date_from_str(end)
  911         else:
  912             self.end = datetime.datetime.max.date()
  913         if self.start > self.end:
  914             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
  915     @classmethod
  916     def day(cls, day):
  917         """Returns a range that only contains the given day"""
  918         return cls(day,day)
  919     def __contains__(self, date):
  920         """Check if the date is in the range"""
  921         if not isinstance(date, datetime.date):
  922             date = date_from_str(date)
  923         return self.start <= date <= self.end
  924     def __str__(self):
  925         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
  926 
  927 
  928 def platform_name():
  929     """ Returns the platform name as a compat_str """
  930     res = platform.platform()
  931     if isinstance(res, bytes):
  932         res = res.decode(preferredencoding())
  933 
  934     assert isinstance(res, compat_str)
  935     return res
  936 
  937 
def _windows_write_string(s, out):
    """ Write the string s to a Windows console through WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no fileno(), its file
    descriptor is neither 1 nor 2, or the handle is not a console).
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor (1 or 2) -> identifier passed to GetStdHandle
    # (presumably the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE constants)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build typed ctypes prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the amount actually written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        """ True if handle is invalid, is not a character device, or
        GetConsoleMode fails on it """
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        # The FILE_TYPE_REMOTE bit is masked out before comparing the type
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        """ Index of the first character above U+FFFF, or len(s) if none """
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write the string piecewise: runs of BMP characters in chunks of at
    # most 1024, and every non-BMP character on its own
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it (presumably its UTF-16 surrogate pair)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part the console accepted
            s = s[written.value:]
    return True
 1008 
 1009 
 1010 def write_string(s, out=None, encoding=None):
 1011     if out is None:
 1012         out = sys.stderr
 1013     assert type(s) == compat_str
 1014 
 1015     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 1016         if _windows_write_string(s, out):
 1017             return
 1018 
 1019     if ('b' in getattr(out, 'mode', '') or
 1020             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 1021         byt = s.encode(encoding or preferredencoding(), 'ignore')
 1022         out.write(byt)
 1023     elif hasattr(out, 'buffer'):
 1024         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 1025         byt = s.encode(enc, 'ignore')
 1026         out.buffer.write(byt)
 1027     else:
 1028         out.write(s)
 1029     out.flush()
 1030 
 1031 
 1032 def bytes_to_intlist(bs):
 1033     if not bs:
 1034         return []
 1035     if isinstance(bs[0], int):  # Python 3
 1036         return list(bs)
 1037     else:
 1038         return [ord(c) for c in bs]
 1039 
 1040 
 1041 def intlist_to_bytes(xs):
 1042     if not xs:
 1043         return b''
 1044     if isinstance(chr(0), bytes):  # Python 2
 1045         return ''.join([chr(x) for x in xs])
 1046     else:
 1047         return bytes(xs)
 1048 
 1049 
 1050 def get_cachedir(params={}):
 1051     cache_root = os.environ.get('XDG_CACHE_HOME',
 1052                                 os.path.expanduser('~/.cache'))
 1053     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 1054 
 1055 
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f): via the kernel32
# LockFileEx / UnlockFileEx functions on win32, via fcntl.lockf elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ ctypes layout of the structure LockFileEx/UnlockFileEx take as
        their last argument """
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high parts of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the open file f; raises OSError if LockFileEx fails """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the very
        # same structure to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK -- confirm
        # against the LockFileEx documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release a lock taken by _lock_file; raises OSError on failure """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f """
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Release the lock held on f """
        fcntl.lockf(f, fcntl.LOCK_UN)
 1119 
 1120 
 1121 class locked_file(object):
 1122     def __init__(self, filename, mode, encoding=None):
 1123         assert mode in ['r', 'a', 'w']
 1124         self.f = io.open(filename, mode, encoding=encoding)
 1125         self.mode = mode
 1126 
 1127     def __enter__(self):
 1128         exclusive = self.mode != 'r'
 1129         try:
 1130             _lock_file(self.f, exclusive)
 1131         except IOError:
 1132             self.f.close()
 1133             raise
 1134         return self
 1135 
 1136     def __exit__(self, etype, value, traceback):
 1137         try:
 1138             _unlock_file(self.f)
 1139         finally:
 1140             self.f.close()
 1141 
 1142     def __iter__(self):
 1143         return iter(self.f)
 1144 
 1145     def write(self, *args):
 1146         return self.f.write(*args)
 1147 
 1148     def read(self, *args):
 1149         return self.f.read(*args)
 1150 
 1151 
 1152 def shell_quote(args):
 1153     quoted_args = []
 1154     encoding = sys.getfilesystemencoding()
 1155     if encoding is None:
 1156         encoding = 'utf-8'
 1157     for a in args:
 1158         if isinstance(a, bytes):
 1159             # We may get a filename encoded with 'encodeFilename'
 1160             a = a.decode(encoding)
 1161         quoted_args.append(pipes.quote(a))
 1162     return u' '.join(quoted_args)
 1163 
 1164 
 1165 def takewhile_inclusive(pred, seq):
 1166     """ Like itertools.takewhile, but include the latest evaluated element
 1167         (the first element so that Not pred(e)) """
 1168     for e in seq:
 1169         yield e
 1170         if not pred(e):
 1171             return
 1172 
 1173 
 1174 def smuggle_url(url, data):
 1175     """ Pass additional data in a URL for internal use. """
 1176 
 1177     sdata = compat_urllib_parse.urlencode(
 1178         {u'__youtubedl_smuggle': json.dumps(data)})
 1179     return url + u'#' + sdata
 1180 
 1181 
 1182 def unsmuggle_url(smug_url, default=None):
 1183     if not '#__youtubedl_smuggle' in smug_url:
 1184         return smug_url, default
 1185     url, _, sdata = smug_url.rpartition(u'#')
 1186     jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
 1187     data = json.loads(jsond)
 1188     return url, data
 1189 
 1190 
 1191 def format_bytes(bytes):
 1192     if bytes is None:
 1193         return u'N/A'
 1194     if type(bytes) is str:
 1195         bytes = float(bytes)
 1196     if bytes == 0.0:
 1197         exponent = 0
 1198     else:
 1199         exponent = int(math.log(bytes, 1024.0))
 1200     suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
 1201     converted = float(bytes) / float(1024 ** exponent)
 1202     return u'%.2f%s' % (converted, suffix)
 1203 
 1204 
 1205 def get_term_width():
 1206     columns = os.environ.get('COLUMNS', None)
 1207     if columns:
 1208         return int(columns)
 1209 
 1210     try:
 1211         sp = subprocess.Popen(
 1212             ['stty', 'size'],
 1213             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 1214         out, err = sp.communicate()
 1215         return int(out.split()[1])
 1216     except:
 1217         pass
 1218     return None
 1219 
 1220 
 1221 def month_by_name(name):
 1222     """ Return the number of a month by (locale-independently) English name """
 1223 
 1224     ENGLISH_NAMES = [
 1225         u'January', u'February', u'March', u'April', u'May', u'June',
 1226         u'July', u'August', u'September', u'October', u'November', u'December']
 1227     try:
 1228         return ENGLISH_NAMES.index(name) + 1
 1229     except ValueError:
 1230         return None
 1231 
 1232 
 1233 def fix_xml_ampersands(xml_str):
 1234     """Replace all the '&' by '&amp;' in XML"""
 1235     return re.sub(
 1236         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
 1237         u'&amp;',
 1238         xml_str)
 1239 
 1240 
 1241 def setproctitle(title):
 1242     assert isinstance(title, compat_str)
 1243     try:
 1244         libc = ctypes.cdll.LoadLibrary("libc.so.6")
 1245     except OSError:
 1246         return
 1247     title_bytes = title.encode('utf-8')
 1248     buf = ctypes.create_string_buffer(len(title_bytes))
 1249     buf.value = title_bytes
 1250     try:
 1251         libc.prctl(15, buf, 0, 0, 0)
 1252     except AttributeError:
 1253         return  # Strange libc, just skip this
 1254 
 1255 
 1256 def remove_start(s, start):
 1257     if s.startswith(start):
 1258         return s[len(start):]
 1259     return s
 1260 
 1261 
 1262 def url_basename(url):
 1263     path = compat_urlparse.urlparse(url).path
 1264     return path.strip(u'/').split(u'/')[-1]
 1265 
 1266 
 1267 class HEADRequest(compat_urllib_request.Request):
 1268     def get_method(self):
 1269         return "HEAD"
 1270 
 1271 
 1272 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
 1273     if get_attr:
 1274         if v is not None:
 1275             v = getattr(v, get_attr, None)
 1276     return default if v is None else (int(v) * invscale // scale)
 1277 
 1278 def str_or_none(v, default=None):
 1279     return default if v is None else compat_str(v)
 1280 
 1281 
 1282 def str_to_int(int_str):
 1283     if int_str is None:
 1284         return None
 1285     int_str = re.sub(r'[,\.]', u'', int_str)
 1286     return int(int_str)
 1287 
 1288 
 1289 def float_or_none(v, scale=1, invscale=1, default=None):
 1290     return default if v is None else (float(v) * invscale / scale)
 1291 
 1292 
 1293 def parse_duration(s):
 1294     if s is None:
 1295         return None
 1296 
 1297     m = re.match(
 1298         r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
 1299     if not m:
 1300         return None
 1301     res = int(m.group('secs'))
 1302     if m.group('mins'):
 1303         res += int(m.group('mins')) * 60
 1304         if m.group('hours'):
 1305             res += int(m.group('hours')) * 60 * 60
 1306     return res
 1307 
 1308 
 1309 def prepend_extension(filename, ext):
 1310     name, real_ext = os.path.splitext(filename) 
 1311     return u'{0}.{1}{2}'.format(name, ext, real_ext)
 1312 
 1313 
 1314 def check_executable(exe, args=[]):
 1315     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
 1316     args can be a list of arguments for a short output (like -version) """
 1317     try:
 1318         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
 1319     except OSError:
 1320         return False
 1321     return exe
 1322 
 1323 
 1324 class PagedList(object):
 1325     def __init__(self, pagefunc, pagesize):
 1326         self._pagefunc = pagefunc
 1327         self._pagesize = pagesize
 1328 
 1329     def __len__(self):
 1330         # This is only useful for tests
 1331         return len(self.getslice())
 1332 
 1333     def getslice(self, start=0, end=None):
 1334         res = []
 1335         for pagenum in itertools.count(start // self._pagesize):
 1336             firstid = pagenum * self._pagesize
 1337             nextfirstid = pagenum * self._pagesize + self._pagesize
 1338             if start >= nextfirstid:
 1339                 continue
 1340 
 1341             page_results = list(self._pagefunc(pagenum))
 1342 
 1343             startv = (
 1344                 start % self._pagesize
 1345                 if firstid <= start < nextfirstid
 1346                 else 0)
 1347 
 1348             endv = (
 1349                 ((end - 1) % self._pagesize) + 1
 1350                 if (end is not None and firstid <= end <= nextfirstid)
 1351                 else None)
 1352 
 1353             if startv != 0 or endv is not None:
 1354                 page_results = page_results[startv:endv]
 1355             res.extend(page_results)
 1356 
 1357             # A little optimization - if current page is not "full", ie. does
 1358             # not contain page_size videos then we can assume that this page
 1359             # is the last one - there are no more ids on further pages -
 1360             # i.e. no need to query again.
 1361             if len(page_results) + startv < self._pagesize:
 1362                 break
 1363 
 1364             # If we got the whole page, but the next page is not interesting,
 1365             # break out early as well
 1366             if end == nextfirstid:
 1367                 break
 1368         return res
 1369 
 1370 
 1371 def uppercase_escape(s):
 1372     unicode_escape = codecs.getdecoder('unicode_escape')
 1373     return re.sub(
 1374         r'\\U[0-9a-fA-F]{8}',
 1375         lambda m: unicode_escape(m.group(0))[0],
 1376         s)
 1377 
 1378 try:
 1379     struct.pack(u'!I', 0)
 1380 except TypeError:
 1381     # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
 1382     def struct_pack(spec, *args):
 1383         if isinstance(spec, compat_str):
 1384             spec = spec.encode('ascii')
 1385         return struct.pack(spec, *args)
 1386 
 1387     def struct_unpack(spec, *args):
 1388         if isinstance(spec, compat_str):
 1389             spec = spec.encode('ascii')
 1390         return struct.unpack(spec, *args)
 1391 else:
 1392     struct_pack = struct.pack
 1393     struct_unpack = struct.unpack
 1394 
 1395 
 1396 def read_batch_urls(batch_fd):
 1397     def fixup(url):
 1398         if not isinstance(url, compat_str):
 1399             url = url.decode('utf-8', 'replace')
 1400         BOM_UTF8 = u'\xef\xbb\xbf'
 1401         if url.startswith(BOM_UTF8):
 1402             url = url[len(BOM_UTF8):]
 1403         url = url.strip()
 1404         if url.startswith(('#', ';', ']')):
 1405             return False
 1406         return url
 1407 
 1408     with contextlib.closing(batch_fd) as fd:
 1409         return [url for url in map(fixup, fd) if url]
 1410 
 1411 
 1412 def urlencode_postdata(*args, **kargs):
 1413     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
 1414 
 1415 
 1416 def parse_xml(s):
 1417     class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
 1418         def doctype(self, name, pubid, system):
 1419             pass  # Ignore doctypes
 1420 
 1421     parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
 1422     kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
 1423     return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
 1424 
 1425 
 1426 if sys.version_info < (3, 0) and sys.platform == 'win32':
 1427     def compat_getpass(prompt, *args, **kwargs):
 1428         if isinstance(prompt, compat_str):
 1429             prompt = prompt.encode(preferredencoding())
 1430         return getpass.getpass(prompt, *args, **kwargs)
 1431 else:
 1432     compat_getpass = getpass.getpass
 1433 
 1434 
# Maps a US film rating label to a number -- presumably the minimum viewer
# age associated with that rating; confirm against the callers.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
 1442 
 1443 
 1444 def strip_jsonp(code):
 1445     return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
 1446 
 1447 
 1448 def qualities(quality_ids):
 1449     """ Get a numeric quality value out of a list of possible values """
 1450     def q(qid):
 1451         try:
 1452             return quality_ids.index(qid)
 1453         except ValueError:
 1454             return -1
 1455     return q
 1456 
 1457 
# Default output file name template, made of the %-style named fields
# title, id and ext (presumably filled in from a video's metadata --
# confirm against the code that consumes it).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
 1459 
 1460 try:
 1461     subprocess_check_output = subprocess.check_output
 1462 except AttributeError:
 1463     def subprocess_check_output(*args, **kwargs):
 1464         assert 'input' not in kwargs
 1465         p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
 1466         output, _ = p.communicate()
 1467         ret = p.poll()
 1468         if ret:
 1469             raise subprocess.CalledProcessError(ret, p.args, output=output)
 1470         return output