path: root/youtube_dl/utils.py
blob: 3a2f0022fb87f9c6426f295cbc5da214e6370b72 (plain)
    1 #!/usr/bin/env python
    2 # -*- coding: utf-8 -*-
    3 
    4 import gzip
    5 import io
    6 import json
    7 import locale
    8 import os
    9 import re
   10 import sys
   11 import traceback
   12 import zlib
   13 import email.utils
   15 import datetime
   16 
   17 try:
   18     import urllib.request as compat_urllib_request
   19 except ImportError: # Python 2
   20     import urllib2 as compat_urllib_request
   21 
   22 try:
   23     import urllib.error as compat_urllib_error
   24 except ImportError: # Python 2
   25     import urllib2 as compat_urllib_error
   26 
   27 try:
   28     import urllib.parse as compat_urllib_parse
   29 except ImportError: # Python 2
   30     import urllib as compat_urllib_parse
   31 
   32 try:
   33     from urllib.parse import urlparse as compat_urllib_parse_urlparse
   34 except ImportError: # Python 2
   35     from urlparse import urlparse as compat_urllib_parse_urlparse
   36 
   37 try:
   38     import http.cookiejar as compat_cookiejar
   39 except ImportError: # Python 2
   40     import cookielib as compat_cookiejar
   41 
   42 try:
   43     import html.entities as compat_html_entities
   44 except ImportError: # Python 2
   45     import htmlentitydefs as compat_html_entities
   46 
   47 try:
   48     import html.parser as compat_html_parser
   49 except ImportError: # Python 2
   50     import HTMLParser as compat_html_parser
   51 
   52 try:
   53     import http.client as compat_http_client
   54 except ImportError: # Python 2
   55     import httplib as compat_http_client
   56 
   57 try:
   58     from subprocess import DEVNULL
   59     compat_subprocess_get_DEVNULL = lambda: DEVNULL
   60 except ImportError:
   61     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
   62 
   63 try:
   64     from urllib.parse import parse_qs as compat_parse_qs
   65 except ImportError: # Python 2
   66     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
   67     # Python 2's version is apparently totally broken
   68     def _unquote(string, encoding='utf-8', errors='replace'):
   69         if string == '':
   70             return string
   71         res = string.split('%')
   72         if len(res) == 1:
   73             return string
   74         if encoding is None:
   75             encoding = 'utf-8'
   76         if errors is None:
   77             errors = 'replace'
   78         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
   79         pct_sequence = b''
   80         string = res[0]
   81         for item in res[1:]:
   82             try:
   83                 if not item:
   84                     raise ValueError
   85                 pct_sequence += item[:2].decode('hex')
   86                 rest = item[2:]
   87                 if not rest:
   88                     # This segment was just a single percent-encoded character.
   89                     # May be part of a sequence of code units, so delay decoding.
   90                     # (Stored in pct_sequence).
   91                     continue
   92             except (ValueError, TypeError): # str.decode('hex') raises TypeError on invalid input
   93                 rest = '%' + item
   94             # Encountered non-percent-encoded characters. Flush the current
   95             # pct_sequence.
   96             string += pct_sequence.decode(encoding, errors) + rest
   97             pct_sequence = b''
   98         if pct_sequence:
   99             # Flush the final pct_sequence
  100             string += pct_sequence.decode(encoding, errors)
  101         return string
  102 
  103     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
  104                 encoding='utf-8', errors='replace'):
  105         qs, _coerce_result = qs, unicode
  106         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
  107         r = []
  108         for name_value in pairs:
  109             if not name_value and not strict_parsing:
  110                 continue
  111             nv = name_value.split('=', 1)
  112             if len(nv) != 2:
  113                 if strict_parsing:
  114                     raise ValueError("bad query field: %r" % (name_value,))
  115                 # Handle case of a control-name with no equal sign
  116                 if keep_blank_values:
  117                     nv.append('')
  118                 else:
  119                     continue
  120             if len(nv[1]) or keep_blank_values:
  121                 name = nv[0].replace('+', ' ')
  122                 name = _unquote(name, encoding=encoding, errors=errors)
  123                 name = _coerce_result(name)
  124                 value = nv[1].replace('+', ' ')
  125                 value = _unquote(value, encoding=encoding, errors=errors)
  126                 value = _coerce_result(value)
  127                 r.append((name, value))
  128         return r
  129 
  130     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
  131                 encoding='utf-8', errors='replace'):
  132         parsed_result = {}
  133         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
  134                         encoding=encoding, errors=errors)
  135         for name, value in pairs:
  136             if name in parsed_result:
  137                 parsed_result[name].append(value)
  138             else:
  139                 parsed_result[name] = [value]
  140         return parsed_result
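      # Illustrative usage (editorial note, not part of the original source): with
      # the default keep_blank_values=False, repeated keys collect into a list and
      # empty values are dropped; string types differ between Python 2 (unicode)
      # and Python 3 (str):
      #   >>> compat_parse_qs('a=1&a=2&b=')
      #   {'a': ['1', '2']}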
  141 
  142 try:
  143     compat_str = unicode # Python 2
  144 except NameError:
  145     compat_str = str
  146 
  147 try:
  148     compat_chr = unichr # Python 2
  149 except NameError:
  150     compat_chr = chr
  151 
  152 std_headers = {
  153     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
  154     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  155     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  156     'Accept-Encoding': 'gzip, deflate',
  157     'Accept-Language': 'en-us,en;q=0.5',
  158 }
  159 
  160 def preferredencoding():
  161     """Get preferred encoding.
  162 
  163     Returns the best encoding scheme for the system, based on
  164     locale.getpreferredencoding() and some further tweaks.
  165     """
  166     try:
  167         pref = locale.getpreferredencoding()
  168         u'TEST'.encode(pref)
  169     except:
  170         pref = 'UTF-8'
  171 
  172     return pref
  173 
  174 if sys.version_info < (3,0):
  175     def compat_print(s):
  176         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
  177 else:
  178     def compat_print(s):
  179         assert type(s) == type(u'')
  180         print(s)
  181 
  182 # In Python 2.x, json.dump expects a bytestream.
  183 # In Python 3.x, it writes to a character stream
  184 if sys.version_info < (3,0):
  185     def write_json_file(obj, fn):
  186         with open(fn, 'wb') as f:
  187             json.dump(obj, f)
  188 else:
  189     def write_json_file(obj, fn):
  190         with open(fn, 'w', encoding='utf-8') as f:
  191             json.dump(obj, f)
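      # Illustrative usage (editorial note; the filename is made up): persist an
      # info dict as JSON, letting the helper pick the right file mode for the
      # running Python version:
      #   write_json_file({'id': 'abc123', 'title': 'Example video'}, 'example.info.json')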
  192 
  193 def htmlentity_transform(matchobj):
  194     """Transforms an HTML entity to a character.
  195 
  196     This function receives a match object and is intended to be used with
  197     the re.sub() function.
  198     """
  199     entity = matchobj.group(1)
  200 
  201     # Known non-numeric HTML entity
  202     if entity in compat_html_entities.name2codepoint:
  203         return compat_chr(compat_html_entities.name2codepoint[entity])
  204 
  205     mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
  206     if mobj is not None:
  207         numstr = mobj.group(1)
  208         if numstr.startswith(u'x'):
  209             base = 16
  210             numstr = u'0%s' % numstr
  211         else:
  212             base = 10
  213         return compat_chr(int(numstr, base))
  214 
  215     # Unknown entity in name, return its literal representation
  216     return (u'&%s;' % entity)
  217 
  218 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  219 class AttrParser(compat_html_parser.HTMLParser):
  220     """Modified HTMLParser that isolates a tag with the specified attribute"""
  221     def __init__(self, attribute, value):
  222         self.attribute = attribute
  223         self.value = value
  224         self.result = None
  225         self.started = False
  226         self.depth = {}
  227         self.html = None
  228         self.watch_startpos = False
  229         self.error_count = 0
  230         compat_html_parser.HTMLParser.__init__(self)
  231 
  232     def error(self, message):
  233         if self.error_count > 10 or self.started:
  234             raise compat_html_parser.HTMLParseError(message, self.getpos())
  235         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
  236         self.error_count += 1
  237         self.goahead(1)
  238 
  239     def loads(self, html):
  240         self.html = html
  241         self.feed(html)
  242         self.close()
  243 
  244     def handle_starttag(self, tag, attrs):
  245         attrs = dict(attrs)
  246         if self.started:
  247             self.find_startpos(None)
  248         if self.attribute in attrs and attrs[self.attribute] == self.value:
  249             self.result = [tag]
  250             self.started = True
  251             self.watch_startpos = True
  252         if self.started:
  253             if not tag in self.depth: self.depth[tag] = 0
  254             self.depth[tag] += 1
  255 
  256     def handle_endtag(self, tag):
  257         if self.started:
  258             if tag in self.depth: self.depth[tag] -= 1
  259             if self.depth[self.result[0]] == 0:
  260                 self.started = False
  261                 self.result.append(self.getpos())
  262 
  263     def find_startpos(self, x):
  264         """Needed to put the start position of the result (self.result[1])
  265         after the opening tag with the requested id"""
  266         if self.watch_startpos:
  267             self.watch_startpos = False
  268             self.result.append(self.getpos())
  269     handle_entityref = handle_charref = handle_data = handle_comment = \
  270     handle_decl = handle_pi = unknown_decl = find_startpos
  271 
  272     def get_result(self):
  273         if self.result is None:
  274             return None
  275         if len(self.result) != 3:
  276             return None
  277         lines = self.html.split('\n')
  278         lines = lines[self.result[1][0]-1:self.result[2][0]]
  279         lines[0] = lines[0][self.result[1][1]:]
  280         if len(lines) == 1:
  281             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
  282         lines[-1] = lines[-1][:self.result[2][1]]
  283         return '\n'.join(lines).strip()
  284 # Hack for https://github.com/rg3/youtube-dl/issues/662
  285 if sys.version_info < (2, 7, 3):
  286     AttrParser.parse_endtag = (lambda self, i:
  287         i + len("</scr'+'ipt>")
  288         if self.rawdata[i:].startswith("</scr'+'ipt>")
  289         else compat_html_parser.HTMLParser.parse_endtag(self, i))
  290 
  291 def get_element_by_id(id, html):
  292     """Return the content of the tag with the specified ID in the passed HTML document"""
  293     return get_element_by_attribute("id", id, html)
  294 
  295 def get_element_by_attribute(attribute, value, html):
  296     """Return the content of the tag with the specified attribute in the passed HTML document"""
  297     parser = AttrParser(attribute, value)
  298     try:
  299         parser.loads(html)
  300     except compat_html_parser.HTMLParseError:
  301         pass
  302     return parser.get_result()
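      # Illustrative usage (editorial note): pull the inner HTML of the tag that
      # carries the given attribute value; returns None when nothing matches:
      #   >>> get_element_by_id('title', u'<html><span id="title">Some video</span></html>')
      #   u'Some video'        # plain 'Some video' on Python 3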
  303 
  304 
  305 def clean_html(html):
  306     """Clean an HTML snippet into a readable string"""
  307     # Newline vs <br />
  308     html = html.replace('\n', ' ')
  309     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
  310     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
  311     # Strip html tags
  312     html = re.sub('<.*?>', '', html)
  313     # Replace html entities
  314     html = unescapeHTML(html)
  315     return html.strip()
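      # Illustrative usage (editorial note): <br /> becomes a newline, tags are
      # stripped and entities decoded:
      #   >>> clean_html(u'<p>First line<br/>Second &amp; last</p>')
      #   u'First line\nSecond & last'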
  316 
  317 
  318 def sanitize_open(filename, open_mode):
  319     """Try to open the given filename, and slightly tweak it if this fails.
  320 
  321     Attempts to open the given filename. If this fails, it tries to change
  322     the filename slightly, step by step, until it's either able to open it
  323     or it fails and raises a final exception, like the standard open()
  324     function.
  325 
  326     It returns the tuple (stream, definitive_file_name).
  327     """
  328     try:
  329         if filename == u'-':
  330             if sys.platform == 'win32':
  331                 import msvcrt
  332                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
  333             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
  334         stream = open(encodeFilename(filename), open_mode)
  335         return (stream, filename)
  336     except (IOError, OSError) as err:
  337         # In case of error, try to remove win32 forbidden chars
  338         filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
  339 
  340         # An exception here should be caught in the caller
  341         stream = open(encodeFilename(filename), open_mode)
  342         return (stream, filename)
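      # Illustrative usage (editorial note; the filename is made up): open an
      # output file, retrying with '#'-substituted characters if the OS rejects
      # the original name; u'-' selects stdout instead of a file:
      #   stream, actual_name = sanitize_open(u'Artist: Title.mp4', 'wb')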
  343 
  344 
  345 def timeconvert(timestr):
  346     """Convert RFC 2822 defined time string into system timestamp"""
  347     timestamp = None
  348     timetuple = email.utils.parsedate_tz(timestr)
  349     if timetuple is not None:
  350         timestamp = email.utils.mktime_tz(timetuple)
  351     return timestamp
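      # Illustrative usage (editorial note): turn an RFC 2822 date (e.g. a
      # Last-Modified header) into a Unix timestamp, or None if it cannot be parsed:
      #   >>> timeconvert('Sun, 06 Nov 1994 08:49:37 GMT')
      #   784111777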
  352 
  353 def sanitize_filename(s, restricted=False, is_id=False):
  354     """Sanitizes a string so it could be used as part of a filename.
  355     If restricted is set, use a stricter subset of allowed characters.
  356     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
  357     """
  358     def replace_insane(char):
  359         if char == '?' or ord(char) < 32 or ord(char) == 127:
  360             return ''
  361         elif char == '"':
  362             return '' if restricted else '\''
  363         elif char == ':':
  364             return '_-' if restricted else ' -'
  365         elif char in '\\/|*<>':
  366             return '_'
  367         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
  368             return '_'
  369         if restricted and ord(char) > 127:
  370             return '_'
  371         return char
  372 
  373     result = u''.join(map(replace_insane, s))
  374     if not is_id:
  375         while '__' in result:
  376             result = result.replace('__', '_')
  377         result = result.strip('_')
  378         # Common case of "Foreign band name - English song title"
  379         if restricted and result.startswith('-_'):
  380             result = result[2:]
  381         if not result:
  382             result = '_'
  383     return result
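      # Illustrative usage (editorial note): forbidden characters are replaced
      # and, in restricted mode, whitespace and non-ASCII collapse to '_':
      #   >>> sanitize_filename(u'AC/DC: Back in Black')
      #   u'AC_DC - Back in Black'
      #   >>> sanitize_filename(u'AC/DC: Back in Black', restricted=True)
      #   u'AC_DC_-_Back_in_Black'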
  384 
  385 def orderedSet(iterable):
  386     """ Remove all duplicates from the input iterable """
  387     res = []
  388     for el in iterable:
  389         if el not in res:
  390             res.append(el)
  391     return res
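      # Illustrative usage (editorial note): order-preserving deduplication:
      #   >>> orderedSet([1, 2, 1, 3, 2])
      #   [1, 2, 3]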
  392 
  393 def unescapeHTML(s):
  394     """
  395     @param s a string
  396     """
  397     assert type(s) == type(u'')
  398 
  399     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
  400     return result
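      # Illustrative usage (editorial note); the argument must be a unicode string:
      #   >>> unescapeHTML(u'Ben &amp; Jerry&#39;s')
      #   u"Ben & Jerry's"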
  401 
  402 def encodeFilename(s):
  403     """
  404     @param s The name of the file
  405     """
  406 
  407     assert type(s) == type(u'')
  408 
  409     # Python 3 has a Unicode API
  410     if sys.version_info >= (3, 0):
  411         return s
  412 
  413     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
  414         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
  415         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
  416         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
  417         return s
  418     else:
  419         encoding = sys.getfilesystemencoding()
  420         if encoding is None:
  421             encoding = 'utf-8'
  422         return s.encode(encoding, 'ignore')
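      # Illustrative usage (editorial note; the filename is made up): wrap any
      # filename handed to open()/os.* so that Python 2 on non-Windows systems
      # gets a byte string in the filesystem encoding:
      #   stream = open(encodeFilename(u'video-title.mp4'), 'wb')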
  423 
  424 def decodeOption(optval):
  425     if optval is None:
  426         return optval
  427     if isinstance(optval, bytes):
  428         optval = optval.decode(preferredencoding())
  429 
  430     assert isinstance(optval, compat_str)
  431     return optval
  432 
  433 class ExtractorError(Exception):
  434     """Error during info extraction."""
  435     def __init__(self, msg, tb=None):
  436         """ tb, if given, is the original traceback (so that it can be printed out). """
  437         super(ExtractorError, self).__init__(msg)
  438         self.traceback = tb
  439         self.exc_info = sys.exc_info()  # preserve original exception
  440 
  441     def format_traceback(self):
  442         if self.traceback is None:
  443             return None
  444         return u''.join(traceback.format_tb(self.traceback))
  445 
  446 
  447 class DownloadError(Exception):
  448     """Download Error exception.
  449 
  450     This exception may be thrown by FileDownloader objects if they are not
  451     configured to continue on errors. They will contain the appropriate
  452     error message.
  453     """
  454     def __init__(self, msg, exc_info=None):
  455         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
  456         super(DownloadError, self).__init__(msg)
  457         self.exc_info = exc_info
  458 
  459 
  460 class SameFileError(Exception):
  461     """Same File exception.
  462 
  463     This exception will be thrown by FileDownloader objects if they detect
  464     multiple files would have to be downloaded to the same file on disk.
  465     """
  466     pass
  467 
  468 
  469 class PostProcessingError(Exception):
  470     """Post Processing exception.
  471 
  472     This exception may be raised by PostProcessor's .run() method to
  473     indicate an error in the postprocessing task.
  474     """
  475     def __init__(self, msg):
  476         self.msg = msg
  477 
  478 class MaxDownloadsReached(Exception):
  479     """ --max-downloads limit has been reached. """
  480     pass
  481 
  482 
  483 class UnavailableVideoError(Exception):
  484     """Unavailable Format exception.
  485 
  486     This exception will be thrown when a video is requested
  487     in a format that is not available for that video.
  488     """
  489     pass
  490 
  491 
  492 class ContentTooShortError(Exception):
  493     """Content Too Short exception.
  494 
  495     This exception may be raised by FileDownloader objects when a file they
  496     download is too small for what the server announced first, indicating
  497     the connection was probably interrupted.
  498     """
  499     # Both in bytes
  500     downloaded = None
  501     expected = None
  502 
  503     def __init__(self, downloaded, expected):
  504         self.downloaded = downloaded
  505         self.expected = expected
  506 
  507 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
  508     """Handler for HTTP requests and responses.
  509 
  510     This class, when installed with an OpenerDirector, automatically adds
  511     the standard headers to every HTTP request and handles gzipped and
  512     deflated responses from web servers. If compression is to be avoided in
  513     a particular request, the original request in the program code only has
  514     to include the HTTP header "Youtubedl-No-Compression", which will be
  515     removed before making the real request.
  516 
  517     Part of this code was copied from:
  518 
  519     http://techknack.net/python-urllib2-handlers/
  520 
  521     Andrew Rowls, the author of that code, agreed to release it to the
  522     public domain.
  523     """
  524 
  525     @staticmethod
  526     def deflate(data):
  527         try:
  528             return zlib.decompress(data, -zlib.MAX_WBITS)
  529         except zlib.error:
  530             return zlib.decompress(data)
  531 
  532     @staticmethod
  533     def addinfourl_wrapper(stream, headers, url, code):
  534         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
  535             return compat_urllib_request.addinfourl(stream, headers, url, code)
  536         ret = compat_urllib_request.addinfourl(stream, headers, url)
  537         ret.code = code
  538         return ret
  539 
  540     def http_request(self, req):
  541         for h,v in std_headers.items():
  542             if h in req.headers:
  543                 del req.headers[h]
  544             req.add_header(h, v)
  545         if 'Youtubedl-no-compression' in req.headers:
  546             if 'Accept-encoding' in req.headers:
  547                 del req.headers['Accept-encoding']
  548             del req.headers['Youtubedl-no-compression']
  549         if 'Youtubedl-user-agent' in req.headers:
  550             if 'User-agent' in req.headers:
  551                 del req.headers['User-agent']
  552             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
  553             del req.headers['Youtubedl-user-agent']
  554         return req
  555 
  556     def http_response(self, req, resp):
  557         old_resp = resp
  558         # gzip
  559         if resp.headers.get('Content-encoding', '') == 'gzip':
  560             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
  561             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
  562             resp.msg = old_resp.msg
  563         # deflate
  564         if resp.headers.get('Content-encoding', '') == 'deflate':
  565             gz = io.BytesIO(self.deflate(resp.read()))
  566             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
  567             resp.msg = old_resp.msg
  568         return resp
  569 
  570     https_request = http_request
  571     https_response = http_response
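      # Illustrative usage (editorial note): install the handler on an opener so
      # every request carries std_headers and gzip/deflate responses are
      # transparently decompressed; the marker header disables compression for a
      # single request and is stripped before sending:
      #   opener = compat_urllib_request.build_opener(YoutubeDLHandler())
      #   req = compat_urllib_request.Request('http://example.com/video',
      #                                       headers={'Youtubedl-No-Compression': '1'})
      #   page = opener.open(req).read()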
  572 
  573 def unified_strdate(date_str):
  574     """Return a string with the date in the format YYYYMMDD"""
  575     upload_date = None
  576     #Replace commas
  577     date_str = date_str.replace(',',' ')
  578     # %z (UTC offset) is only supported in python>=3.2
  579     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
  580     format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
  581     for expression in format_expressions:
  582         try:
  583             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
  584         except:
  585             pass
  586     return upload_date
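      # Illustrative usage (editorial note): normalise the date formats listed
      # above to YYYYMMDD, or return None if none of them match:
      #   >>> unified_strdate(u'2012-12-21')
      #   '20121221'
      #   >>> unified_strdate(u'December 21, 2012')
      #   '20121221'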
  587 
  588 def date_from_str(date_str):
  589     """Return a datetime object from a string in the format YYYYMMDD"""
  590     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
  591     
  592 class DateRange(object):
  593     """Represents a time interval between two dates"""
  594     def __init__(self, start=None, end=None):
  595         """start and end must be strings in the YYYYMMDD format accepted by date_from_str"""
  596         if start is not None:
  597             self.start = date_from_str(start)
  598         else:
  599             self.start = datetime.datetime.min.date()
  600         if end is not None:
  601             self.end = date_from_str(end)
  602         else:
  603             self.end = datetime.datetime.max.date()
  604         if self.start > self.end:
  605             raise ValueError('Date range: "%s", the start date must not be after the end date' % self)
  606     @classmethod
  607     def day(cls, day):
  608         """Returns a range that only contains the given day"""
  609         return cls(day,day)
  610     def __contains__(self, date):
  611         """Check if the date is in the range"""
  612         date = date_from_str(date)
  613         return self.start <= date and date <= self.end
  614     def __str__(self):
  615         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
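      # Illustrative usage (editorial note): the date-filtering options build one
      # of these and test candidate upload dates against it:
      #   >>> '20120505' in DateRange('20120101', '20121231')
      #   True
      #   >>> '20130101' in DateRange.day('20121221')
      #   False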
