summaryrefslogtreecommitdiff
path: root/youtube_dl/jsinterp.py
blob: 48c27a1c04b0df6b4faa8833a16abb8a45ec6fe3 (plain)
    1 from __future__ import unicode_literals
    2 
    3 import itertools
    4 import json
    5 import math
    6 import operator
    7 import re
    8 
    9 from .utils import (
   10     error_to_compat_str,
   11     ExtractorError,
   12     js_to_json,
   13     remove_quotes,
   14     unified_timestamp,
   15 )
   16 from .compat import (
   17     compat_collections_chain_map as ChainMap,
   18     compat_itertools_zip_longest as zip_longest,
   19     compat_str,
   20 )
   21 
# Regex source matching a JS identifier; interpolated into the larger
# patterns used by JSInterpreter
_NAME_RE = r'[a-zA-Z_$][\w$]*'

# Unique sentinel standing for the JS `undefined` value (None is used for JS null)
_UNDEFINED = object()
   25 
   26 
   27 def _js_bit_op(op):
   28 
   29     def wrapped(a, b):
   30         def zeroise(x):
   31             return 0 if x in (None, _UNDEFINED) else x
   32         return op(zeroise(a), zeroise(b))
   33 
   34     return wrapped
   35 
   36 
   37 def _js_arith_op(op):
   38 
   39     def wrapped(a, b):
   40         if _UNDEFINED in (a, b):
   41             return float('nan')
   42         return op(a or 0, b or 0)
   43 
   44     return wrapped
   45 
   46 
   47 def _js_div(a, b):
   48     if _UNDEFINED in (a, b) or not (a and b):
   49         return float('nan')
   50     return float('inf') if not b else operator.truediv(a or 0, b)
   51 
   52 
   53 def _js_mod(a, b):
   54     if _UNDEFINED in (a, b) or not b:
   55         return float('nan')
   56     return (a or 0) % b
   57 
   58 
   59 def _js_exp(a, b):
   60     if not b:
   61         # even 0 ** 0 !!
   62         return 1
   63     if _UNDEFINED in (a, b):
   64         return float('nan')
   65     return (a or 0) ** b
   66 
   67 
   68 def _js_eq_op(op):
   69 
   70     def wrapped(a, b):
   71         if set((a, b)) <= set((None, _UNDEFINED)):
   72             return op(a, a)
   73         return op(a, b)
   74 
   75     return wrapped
   76 
   77 
   78 def _js_comp_op(op):
   79 
   80     def wrapped(a, b):
   81         if _UNDEFINED in (a, b):
   82             return False
   83         return op(a or 0, b or 0)
   84 
   85     return wrapped
   86 
   87 
# Operator tables: tuples of (op, definition) pairs
# avoid dict to maintain order
# definition None => Defined in JSInterpreter._operator
# NB: the order matters. JSInterpreter.interpret_statement tries the operators
# in the order given by JSInterpreter._all_operators() and splits the
# expression at the first one that occurs in it, so an operator that is tried
# earlier is applied last, ie, it binds more loosely than those tried later
_DOT_OPERATORS = (
    ('.', None),
    # TODO: ('?.', None),
)

# Shift, arithmetic and exponentiation operators
_OPERATORS = (
    ('>>', _js_bit_op(operator.rshift)),
    ('<<', _js_bit_op(operator.lshift)),
    ('+', _js_arith_op(operator.add)),
    ('-', _js_arith_op(operator.sub)),
    ('*', _js_arith_op(operator.mul)),
    ('/', _js_div),
    ('%', _js_mod),
    ('**', _js_exp),
)

# Equality and relational operators
# NOTE(review): === and !== are mapped to Python identity tests, so distinct
# objects with equal values compare as not strictly equal - confirm that this
# is intended
_COMP_OPERATORS = (
    ('===', operator.is_),
    ('==', _js_eq_op(operator.eq)),
    ('!==', operator.is_not),
    ('!=', _js_eq_op(operator.ne)),
    ('<=', _js_comp_op(operator.le)),
    ('>=', _js_comp_op(operator.ge)),
    ('<', _js_comp_op(operator.lt)),
    ('>', _js_comp_op(operator.gt)),
)

# Bitwise or/xor/and operators
_LOG_OPERATORS = (
    ('|', _js_bit_op(operator.or_)),
    ('^', _js_bit_op(operator.xor)),
    ('&', _js_bit_op(operator.and_)),
)

# Ternary, nullish-coalescing and short-circuiting operators, all
# implemented in JSInterpreter._operator
_SC_OPERATORS = (
    ('?', None),
    ('??', None),
    ('||', None),
    ('&&', None),
)

# Regex alternation of the (escaped) operators that can precede `=` in an
# augmented assignment
_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS))

# Maps each opening bracket to its closing bracket: ( -> ), { -> }, [ -> ]
_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]')))
# Characters that delimit a quoted span; / is included for regex literals
_QUOTES = '\'"/'
  135 
  136 
  137 def _ternary(cndn, if_true=True, if_false=False):
  138     """Simulate JS's ternary operator (cndn?if_true:if_false)"""
  139     if cndn in (False, None, 0, '', _UNDEFINED):
  140         return if_false
  141     try:
  142         if math.isnan(cndn):  # NB: NaN cannot be checked by membership
  143             return if_false
  144     except TypeError:
  145         pass
  146     return if_true
  147 
  148 
  149 class JS_Break(ExtractorError):
  150     def __init__(self):
  151         ExtractorError.__init__(self, 'Invalid break')
  152 
  153 
  154 class JS_Continue(ExtractorError):
  155     def __init__(self):
  156         ExtractorError.__init__(self, 'Invalid continue')
  157 
  158 
  159 class JS_Throw(ExtractorError):
  160     def __init__(self, e):
  161         self.error = e
  162         ExtractorError.__init__(self, 'Uncaught exception ' + error_to_compat_str(e))
  163 
  164 
  165 class LocalNameSpace(ChainMap):
  166     def __getitem__(self, key):
  167         try:
  168             return super(LocalNameSpace, self).__getitem__(key)
  169         except KeyError:
  170             return _UNDEFINED
  171 
  172     def __setitem__(self, key, value):
  173         for scope in self.maps:
  174             if key in scope:
  175                 scope[key] = value
  176                 return
  177         self.maps[0][key] = value
  178 
  179     def __delitem__(self, key):
  180         raise NotImplementedError('Deleting is not supported')
  181 
  182     # except
  183     def pop(self, key, *args):
  184         try:
  185             off = self.__getitem__(key)
  186             super(LocalNameSpace, self).__delitem__(key)
  187             return off
  188         except KeyError:
  189             if len(args) > 0:
  190                 return args[0]
  191             raise
  192 
  193     def __contains__(self, key):
  194         try:
  195             super(LocalNameSpace, self).__getitem__(key)
  196             return True
  197         except KeyError:
  198             return False
  199 
  200     def __repr__(self):
  201         return 'LocalNameSpace%s' % (self.maps, )
  202 
  203 
  204 class JSInterpreter(object):
    # Counter used by _named_object() to number the names that it generates
    __named_object_counter = 0

    # Sentinel standing for the JS `undefined` value
    undefined = _UNDEFINED

    # Maps each JS regex literal flag character to a bitmask value
    RE_FLAGS = {
        # special knowledge: Python's re flags are bitmask values, current max 128
        # invent new bitmask values well above that for literal parsing
        # TODO: new pattern class to execute matches with these flags
        'd': 1024,  # Generate indices for substring matches
        'g': 2048,  # Global search
        'i': re.I,  # Case-insensitive search
        'm': re.M,  # Multi-line search
        's': re.S,  # Allows . to match newline characters
        'u': re.U,  # Treat a pattern as a sequence of unicode code points
        'y': 4096,  # Perform a "sticky" search that matches starting at the current position in the target string
    }

    # Local variable name under which an exception raised in a `try` block
    # is held until a `catch` clause pops it
    _EXC_NAME = '__youtube_dl_exception__'
    # Prefix of the variable names generated by _named_object()
    _OBJ_NAME = '__youtube_dl_jsinterp_obj'

    # Set of the characters occurring in any operator, plus ; and ,
    # None until computed by __init__() when the first instance is created
    OP_CHARS = None
  226 
  227     def __init__(self, code, objects=None):
  228         self.code, self._functions = code, {}
  229         self._objects = {} if objects is None else objects
  230         if type(self).OP_CHARS is None:
  231             type(self).OP_CHARS = self.OP_CHARS = self.__op_chars()
  232 
  233     class Exception(ExtractorError):
  234         def __init__(self, msg, *args, **kwargs):
  235             expr = kwargs.pop('expr', None)
  236             if expr is not None:
  237                 msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100])
  238             super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs)
  239 
  240     @classmethod
  241     def __op_chars(cls):
  242         op_chars = set(';,')
  243         for op in cls._all_operators():
  244             for c in op[0]:
  245                 op_chars.add(c)
  246         return op_chars
  247 
  248     def _named_object(self, namespace, obj):
  249         self.__named_object_counter += 1
  250         name = '%s%d' % (self._OBJ_NAME, self.__named_object_counter)
  251         namespace[name] = obj
  252         return name
  253 
  254     @classmethod
  255     def _regex_flags(cls, expr):
  256         flags = 0
  257         if not expr:
  258             return flags, expr
  259         for idx, ch in enumerate(expr):
  260             if ch not in cls.RE_FLAGS:
  261                 break
  262             flags |= cls.RE_FLAGS[ch]
  263         return flags, expr[idx:] if idx > 0 else expr
  264 
    @classmethod
    def _separate(cls, expr, delim=',', max_split=None, skip_delims=None):
        """Generate the pieces of `expr` separated by `delim`

        A delimiter is only recognised outside brackets ((), {}, []) and
        outside quoted spans delimited by one of _QUOTES (a / is only
        taken to open a regex literal after an operator character).

        @param expr         the text to split; nothing is generated if it is falsy
        @param delim        the delimiter text, one or more characters
        @param max_split    if truthy, stop splitting after this many splits
                            and generate the rest of expr as the final piece
        @param skip_delims  a string, or a list or tuple of strings: when
                            one of them starts at a position where the
                            delimiter could start, no delimiter is
                            recognised within its length

        NB: reads cls.OP_CHARS, which is None until set by __init__().
        """
        if not expr:
            return
        # nesting depth of each bracket type, keyed by its closing character
        counters = {k: 0 for k in _MATCHING_PARENS.values()}
        # delim_len is the delimiter length less 1, ie, the value of pos
        # at the last character of the delimiter
        start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
        in_quote, escaping, skipping = None, False, 0
        after_op, in_regex_char_group, skip_re = True, False, 0

        for idx, char in enumerate(expr):
            if skip_re > 0:
                # still inside the flags following a regex literal
                skip_re -= 1
                continue
            if not in_quote:
                if char in _MATCHING_PARENS:
                    counters[_MATCHING_PARENS[char]] += 1
                elif char in counters:
                    counters[char] -= 1
            if not escaping and char in _QUOTES and in_quote in (char, None):
                # an unescaped quote character that can open or close a span;
                # outside a quoted span, / only counts after an operator
                if in_quote or after_op or char != '/':
                    # close the span, unless inside a regex [...] group, or open one
                    in_quote = None if in_quote and not in_regex_char_group else char
                    if in_quote is None and char == '/' and delim != '/':
                        # regexp flags
                        n_idx = idx + 1
                        while n_idx < len(expr) and expr[n_idx] in cls.RE_FLAGS:
                            n_idx += 1
                        skip_re = n_idx - idx - 1
                        if skip_re > 0:
                            continue
            elif in_quote == '/' and char in '[]':
                in_regex_char_group = char == '['
            # a backslash in a quoted span escapes the next character
            escaping = not escaping and in_quote and char == '\\'
            # remains true across spaces following an operator character
            after_op = not in_quote and char in cls.OP_CHARS or (char == ' ' and after_op)

            if char != delim[pos] or any(counters.values()) or in_quote:
                # not at the next delimiter character, or nested or quoted:
                # restart the delimiter match
                pos = skipping = 0
                continue
            elif skipping > 0:
                skipping -= 1
                continue
            elif pos == 0 and skip_delims:
                here = expr[idx:]
                for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]:
                    if here.startswith(s) and s:
                        skipping = len(s) - 1
                        break
                if skipping > 0:
                    continue
            if pos < delim_len:
                # part-way through a multi-character delimiter
                pos += 1
                continue
            # the whole delimiter has matched, ending at idx
            yield expr[start: idx - delim_len]
            start, pos = idx + 1, 0
            splits += 1
            if max_split and splits >= max_split:
                break
        # whatever follows the last delimiter recognised
        yield expr[start:]
  322 
  323     @classmethod
  324     def _separate_at_paren(cls, expr, delim):
  325         separated = list(cls._separate(expr, delim, 1))
  326 
  327         if len(separated) < 2:
  328             raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals()))
  329         return separated[0][1:].strip(), separated[1].strip()
  330 
  331     @staticmethod
  332     def _all_operators():
  333         return itertools.chain(
  334             # Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence
  335             _SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS)
  336 
  337     def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion):
  338         if op in ('||', '&&'):
  339             if (op == '&&') ^ _ternary(left_val):
  340                 return left_val  # short circuiting
  341         elif op == '??':
  342             if left_val not in (None, self.undefined):
  343                 return left_val
  344         elif op == '?':
  345             right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1))
  346 
  347         right_val = self.interpret_expression(right_expr, local_vars, allow_recursion)
  348         opfunc = op and next((v for k, v in self._all_operators() if k == op), None)
  349         if not opfunc:
  350             return right_val
  351 
  352         try:
  353             return opfunc(left_val, right_val)
  354         except Exception as e:
  355             raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, cause=e)
  356 
  357     def _index(self, obj, idx, allow_undefined=False):
  358         if idx == 'length':
  359             return len(obj)
  360         try:
  361             return obj[int(idx)] if isinstance(obj, list) else obj[idx]
  362         except Exception as e:
  363             if allow_undefined:
  364                 return self.undefined
  365             raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e)
  366 
  367     def _dump(self, obj, namespace):
  368         try:
  369             return json.dumps(obj)
  370         except TypeError:
  371             return self._named_object(namespace, obj)
  372 
  373     def interpret_statement(self, stmt, local_vars, allow_recursion=100):
  374         if allow_recursion < 0:
  375             raise self.Exception('Recursion limit reached')
  376         allow_recursion -= 1
  377 
  378         should_return = False
  379         sub_statements = list(self._separate(stmt, ';')) or ['']
  380         expr = stmt = sub_statements.pop().strip()
  381         for sub_stmt in sub_statements:
  382             ret, should_return = self.interpret_statement(sub_stmt, local_vars, allow_recursion)
  383             if should_return:
  384                 return ret, should_return
  385 
  386         m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt)
  387         if m:
  388             expr = stmt[len(m.group(0)):].strip()
  389             if m.group('throw'):
  390                 raise JS_Throw(self.interpret_expression(expr, local_vars, allow_recursion))
  391             should_return = not m.group('var')
  392         if not expr:
  393             return None, should_return
  394 
  395         if expr[0] in _QUOTES:
  396             inner, outer = self._separate(expr, expr[0], 1)
  397             if expr[0] == '/':
  398                 flags, _ = self._regex_flags(outer)
  399                 inner, outer = inner.replace('"', r'\"'), ''
  400                 inner = re.compile(js_to_json(inner + expr[0]), flags=flags)  # , strict=True))
  401             else:
  402                 inner = json.loads(js_to_json(inner + expr[0]))  # , strict=True))
  403             if not outer:
  404                 return inner, should_return
  405             expr = self._named_object(local_vars, inner) + outer
  406 
  407         if expr.startswith('new '):
  408             obj = expr[4:]
  409             if obj.startswith('Date('):
  410                 left, right = self._separate_at_paren(obj[4:], ')')
  411                 expr = unified_timestamp(
  412                     self.interpret_expression(left, local_vars, allow_recursion), False)
  413                 if not expr:
  414                     raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr)
  415                 expr = self._dump(int(expr * 1000), local_vars) + right
  416             else:
  417                 raise self.Exception('Unsupported object {obj}'.format(**locals()), expr=expr)
  418 
  419         if expr.startswith('void '):
  420             left = self.interpret_expression(expr[5:], local_vars, allow_recursion)
  421             return None, should_return
  422 
  423         if expr.startswith('{'):
  424             inner, outer = self._separate_at_paren(expr, '}')
  425             # try for object expression
  426             sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)]
  427             if all(len(sub_expr) == 2 for sub_expr in sub_expressions):
  428                 return dict(
  429                     (key_expr if re.match(_NAME_RE, key_expr) else key_expr,
  430                      self.interpret_expression(val_expr, local_vars, allow_recursion))
  431                     for key_expr, val_expr in sub_expressions), should_return
  432             # or statement list
  433             inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion)
  434             if not outer or should_abort:
  435                 return inner, should_abort or should_return
  436             else:
  437                 expr = self._dump(inner, local_vars) + outer
  438 
  439         if expr.startswith('('):
  440             inner, outer = self._separate_at_paren(expr, ')')
  441             inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion)
  442             if not outer or should_abort:
  443                 return inner, should_abort or should_return
  444             else:
  445                 expr = self._dump(inner, local_vars) + outer
  446 
  447         if expr.startswith('['):
  448             inner, outer = self._separate_at_paren(expr, ']')
  449             name = self._named_object(local_vars, [
  450                 self.interpret_expression(item, local_vars, allow_recursion)
  451                 for item in self._separate(inner)])
  452             expr = name + outer
  453 
  454         m = re.match(r'''(?x)
  455             (?P<try>try|finally)\s*|
  456             (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))|
  457             (?P<switch>switch)\s*\(|
  458             (?P<for>for)\s*\(|'''.format(**globals()), expr)
  459         md = m.groupdict() if m else {}
  460         if md.get('try'):
  461             if expr[m.end()] == '{':
  462                 try_expr, expr = self._separate_at_paren(expr[m.end():], '}')
  463             else:
  464                 try_expr, expr = expr[m.end() - 1:], ''
  465             try:
  466                 ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion)
  467                 if should_abort:
  468                     return ret, True
  469             except JS_Throw as e:
  470                 local_vars[self._EXC_NAME] = e.error
  471             except Exception as e:
  472                 # XXX: This works for now, but makes debugging future issues very hard
  473                 local_vars[self._EXC_NAME] = e
  474             ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
  475             return ret, should_abort or should_return
  476 
  477         elif md.get('catch'):
  478             catch_expr, expr = self._separate_at_paren(expr[m.end():], '}')
  479             if self._EXC_NAME in local_vars:
  480                 catch_vars = local_vars.new_child({m.group('err'): local_vars.pop(self._EXC_NAME)})
  481                 ret, should_abort = self.interpret_statement(catch_expr, catch_vars, allow_recursion)
  482                 if should_abort:
  483                     return ret, True
  484 
  485             ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
  486             return ret, should_abort or should_return
  487 
  488         elif md.get('for'):
  489             constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
  490             if remaining.startswith('{'):
  491                 body, expr = self._separate_at_paren(remaining, '}')
  492             else:
  493                 switch_m = re.match(r'switch\s*\(', remaining)  # FIXME
  494                 if switch_m:
  495                     switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:], ')')
  496                     body, expr = self._separate_at_paren(remaining, '}')
  497                     body = 'switch(%s){%s}' % (switch_val, body)
  498                 else:
  499                     body, expr = remaining, ''
  500             start, cndn, increment = self._separate(constructor, ';')
  501             self.interpret_expression(start, local_vars, allow_recursion)
  502             while True:
  503                 if not _ternary(self.interpret_expression(cndn, local_vars, allow_recursion)):
  504                     break
  505                 try:
  506                     ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion)
  507                     if should_abort:
  508                         return ret, True
  509                 except JS_Break:
  510                     break
  511                 except JS_Continue:
  512                     pass
  513                 self.interpret_expression(increment, local_vars, allow_recursion)
  514             ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
  515             return ret, should_abort or should_return
  516 
  517         elif md.get('switch'):
  518             switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
  519             switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion)
  520             body, expr = self._separate_at_paren(remaining, '}')
  521             items = body.replace('default:', 'case default:').split('case ')[1:]
  522             for default in (False, True):
  523                 matched = False
  524                 for item in items:
  525                     case, stmt = (i.strip() for i in self._separate(item, ':', 1))
  526                     if default:
  527                         matched = matched or case == 'default'
  528                     elif not matched:
  529                         matched = (case != 'default'
  530                                    and switch_val == self.interpret_expression(case, local_vars, allow_recursion))
  531                     if not matched:
  532                         continue
  533                     try:
  534                         ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion)
  535                         if should_abort:
  536                             return ret
  537                     except JS_Break:
  538                         break
  539                 if matched:
  540                     break
  541             ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
  542             return ret, should_abort or should_return
  543 
  544         # Comma separated statements
  545         sub_expressions = list(self._separate(expr))
  546         if len(sub_expressions) > 1:
  547             for sub_expr in sub_expressions:
  548                 ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion)
  549                 if should_abort:
  550                     return ret, True
  551             return ret, False
  552 
  553         for m in re.finditer(r'''(?x)
  554                 (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})|
  555                 (?P<var2>{_NAME_RE})(?P<post_sign>\+\+|--)'''.format(**globals()), expr):
  556             var = m.group('var1') or m.group('var2')
  557             start, end = m.span()
  558             sign = m.group('pre_sign') or m.group('post_sign')
  559             ret = local_vars[var]
  560             local_vars[var] += 1 if sign[0] == '+' else -1
  561             if m.group('pre_sign'):
  562                 ret = local_vars[var]
  563             expr = expr[:start] + self._dump(ret, local_vars) + expr[end:]
  564 
  565         if not expr:
  566             return None, should_return
  567 
  568         m = re.match(r'''(?x)
  569             (?P<assign>
  570                 (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s*
  571                 (?P<op>{_OPERATOR_RE})?
  572                 =(?!=)(?P<expr>.*)$
  573             )|(?P<return>
  574                 (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$
  575             )|(?P<indexing>
  576                 (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$
  577             )|(?P<attribute>
  578                 (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s*
  579             )|(?P<function>
  580                 (?P<fname>{_NAME_RE})\((?P<args>.*)\)$
  581             )'''.format(**globals()), expr)
  582         md = m.groupdict() if m else {}
  583         if md.get('assign'):
  584             left_val = local_vars.get(m.group('out'))
  585 
  586             if not m.group('index'):
  587                 local_vars[m.group('out')] = self._operator(
  588                     m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion)
  589                 return local_vars[m.group('out')], should_return
  590             elif left_val in (None, self.undefined):
  591                 raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr)
  592 
  593             idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
  594             if not isinstance(idx, (int, float)):
  595                 raise self.Exception('List index %s must be integer' % (idx, ), expr=expr)
  596             idx = int(idx)
  597             left_val[idx] = self._operator(
  598                 m.group('op'), self._index(left_val, idx), m.group('expr'), expr, local_vars, allow_recursion)
  599             return left_val[idx], should_return
  600 
  601         elif expr.isdigit():
  602             return int(expr), should_return
  603 
  604         elif expr == 'break':
  605             raise JS_Break()
  606         elif expr == 'continue':
  607             raise JS_Continue()
  608 
  609         elif expr == 'undefined':
  610             return self.undefined, should_return
  611 
  612         elif md.get('return'):
  613             return local_vars[m.group('name')], should_return
  614 
  615         try:
  616             ret = json.loads(js_to_json(expr))  # strict=True)
  617             if not md.get('attribute'):
  618                 return ret, should_return
  619         except ValueError:
  620             pass
  621 
  622         if md.get('indexing'):
  623             val = local_vars[m.group('in')]
  624             idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion)
  625             return self._index(val, idx), should_return
  626 
  627         for op, _ in self._all_operators():
  628             # hackety: </> have higher priority than <</>>, but don't confuse them
  629             skip_delim = (op + op) if op in '<>*?' else None
  630             if op == '?':
  631                 skip_delim = (skip_delim, '?.')
  632             separated = list(self._separate(expr, op, skip_delims=skip_delim))
  633             if len(separated) < 2:
  634                 continue
  635 
  636             right_expr = separated.pop()
  637             while op == '-' and len(separated) > 1 and not separated[-1].strip():
  638                 right_expr = '-' + right_expr
  639                 separated.pop()
  640             left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion)
  641             return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return
  642 
  643         if md.get('attribute'):
  644             variable, member, nullish = m.group('var', 'member', 'nullish')
  645             if not member:
  646                 member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion)
  647             arg_str = expr[m.end():]
  648             if arg_str.startswith('('):
  649                 arg_str, remaining = self._separate_at_paren(arg_str, ')')
  650             else:
  651                 arg_str, remaining = None, arg_str
  652 
  653             def assertion(cndn, msg):
  654                 """ assert, but without risk of getting optimized out """
  655                 if not cndn:
  656                     raise ExtractorError('{member} {msg}'.format(**locals()), expr=expr)
  657 
  658             def eval_method():
  659                 if (variable, member) == ('console', 'debug'):
  660                     return
  661                 types = {
  662                     'String': compat_str,
  663                     'Math': float,
  664                 }
  665                 obj = local_vars.get(variable)
  666                 if obj in (self.undefined, None):
  667                     obj = types.get(variable, self.undefined)
  668                 if obj is self.undefined:
  669                     try:
  670                         if variable not in self._objects:
  671                             self._objects[variable] = self.extract_object(variable)
  672                         obj = self._objects[variable]
  673                     except self.Exception:
  674                         if not nullish:
  675                             raise
  676 
  677                 if nullish and obj is self.undefined:
  678                     return self.undefined
  679 
  680                 # Member access
  681                 if arg_str is None:
  682                     return self._index(obj, member, nullish)
  683 
  684                 # Function call
  685                 argvals = [
  686                     self.interpret_expression(v, local_vars, allow_recursion)
  687                     for v in self._separate(arg_str)]
  688 
  689                 if obj == compat_str:
  690                     if member == 'fromCharCode':
  691                         assertion(argvals, 'takes one or more arguments')
  692                         return ''.join(map(chr, argvals))
  693                     raise self.Exception('Unsupported string method ' + member, expr=expr)
  694                 elif obj == float:
  695                     if member == 'pow':
  696                         assertion(len(argvals) == 2, 'takes two arguments')
  697                         return argvals[0] ** argvals[1]
  698                     raise self.Exception('Unsupported Math method ' + member, expr=expr)
  699 
  700                 if member == 'split':
  701                     assertion(argvals, 'takes one or more arguments')
  702                     assertion(len(argvals) == 1, 'with limit argument is not implemented')
  703                     return obj.split(argvals[0]) if argvals[0] else list(obj)
  704                 elif member == 'join':
  705                     assertion(isinstance(obj, list), 'must be applied on a list')
  706                     assertion(len(argvals) == 1, 'takes exactly one argument')
  707                     return argvals[0].join(obj)
  708                 elif member == 'reverse':
  709                     assertion(not argvals, 'does not take any arguments')
  710                     obj.reverse()
  711                     return obj
  712                 elif member == 'slice':
  713                     assertion(isinstance(obj, list), 'must be applied on a list')
  714                     assertion(len(argvals) == 1, 'takes exactly one argument')
  715                     return obj[argvals[0]:]
  716                 elif member == 'splice':
  717                     assertion(isinstance(obj, list), 'must be applied on a list')
  718                     assertion(argvals, 'takes one or more arguments')
  719                     index, howMany = map(int, (argvals + [len(obj)])[:2])
  720                     if index < 0:
  721                         index += len(obj)
  722                     add_items = argvals[2:]
  723                     res = []
  724                     for i in range(index, min(index + howMany, len(obj))):
  725                         res.append(obj.pop(index))
  726                     for i, item in enumerate(add_items):
  727                         obj.insert(index + i, item)
  728                     return res
  729                 elif member == 'unshift':
  730                     assertion(isinstance(obj, list), 'must be applied on a list')
  731                     assertion(argvals, 'takes one or more arguments')
  732                     for item in reversed(argvals):
  733                         obj.insert(0, item)
  734                     return obj
  735                 elif member == 'pop':
  736                     assertion(isinstance(obj, list), 'must be applied on a list')
  737                     assertion(not argvals, 'does not take any arguments')
  738                     if not obj:
  739                         return
  740                     return obj.pop()
  741                 elif member == 'push':
  742                     assertion(argvals, 'takes one or more arguments')
  743                     obj.extend(argvals)
  744                     return obj
  745                 elif member == 'forEach':
  746                     assertion(argvals, 'takes one or more arguments')
  747                     assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
  748                     f, this = (argvals + [''])[:2]
  749                     return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)]
  750                 elif member == 'indexOf':
  751                     assertion(argvals, 'takes one or more arguments')
  752                     assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
  753                     idx, start = (argvals + [0])[:2]
  754                     try:
  755                         return obj.index(idx, start)
  756                     except ValueError:
  757                         return -1
  758 
  759                 idx = int(member) if isinstance(obj, list) else member
  760                 return obj[idx](argvals, allow_recursion=allow_recursion)
  761 
  762             if remaining:
  763                 ret, should_abort = self.interpret_statement(
  764                     self._named_object(local_vars, eval_method()) + remaining,
  765                     local_vars, allow_recursion)
  766                 return ret, should_return or should_abort
  767             else:
  768                 return eval_method(), should_return
  769 
  770         elif md.get('function'):
  771             fname = m.group('fname')
  772             argvals = [self.interpret_expression(v, local_vars, allow_recursion)
  773                        for v in self._separate(m.group('args'))]
  774             if fname in local_vars:
  775                 return local_vars[fname](argvals, allow_recursion=allow_recursion), should_return
  776             elif fname not in self._functions:
  777                 self._functions[fname] = self.extract_function(fname)
  778             return self._functions[fname](argvals, allow_recursion=allow_recursion), should_return
  779 
  780         raise self.Exception(
  781             'Unsupported JS expression ' + (expr[:40] if expr != stmt else ''), expr=stmt)
  782 
  783     def interpret_expression(self, expr, local_vars, allow_recursion):
  784         ret, should_return = self.interpret_statement(expr, local_vars, allow_recursion)
  785         if should_return:
  786             raise self.Exception('Cannot return from an expression', expr)
  787         return ret
  788 
  789     def extract_object(self, objname):
  790         _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
  791         obj = {}
  792         obj_m = re.search(
  793             r'''(?x)
  794                 (?<!this\.)%s\s*=\s*{\s*
  795                     (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
  796                 }\s*;
  797             ''' % (re.escape(objname), _FUNC_NAME_RE),
  798             self.code)
  799         if not obj_m:
  800             raise self.Exception('Could not find object ' + objname)
  801         fields = obj_m.group('fields')
  802         # Currently, it only supports function definitions
  803         fields_m = re.finditer(
  804             r'''(?x)
  805                 (?P<key>%s)\s*:\s*function\s*\((?P<args>(?:%s|,)*)\){(?P<code>[^}]+)}
  806             ''' % (_FUNC_NAME_RE, _NAME_RE),
  807             fields)
  808         for f in fields_m:
  809             argnames = self.build_arglist(f.group('args'))
  810             obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
  811 
  812         return obj
  813 
  814     def extract_function_code(self, funcname):
  815         """ @returns argnames, code """
  816         func_m = re.search(
  817             r'''(?xs)
  818                 (?:
  819                     function\s+%(name)s|
  820                     [{;,]\s*%(name)s\s*=\s*function|
  821                     (?:var|const|let)\s+%(name)s\s*=\s*function
  822                 )\s*
  823                 \((?P<args>[^)]*)\)\s*
  824                 (?P<code>{.+})''' % {'name': re.escape(funcname)},
  825             self.code)
  826         code, _ = self._separate_at_paren(func_m.group('code'), '}')  # refine the match
  827         if func_m is None:
  828             raise self.Exception('Could not find JS function "{funcname}"'.format(**locals()))
  829         return self.build_arglist(func_m.group('args')), code
  830 
  831     def extract_function(self, funcname):
  832         return self.extract_function_from_code(*self.extract_function_code(funcname))
  833 
  834     def extract_function_from_code(self, argnames, code, *global_stack):
  835         local_vars = {}
  836         while True:
  837             mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code)
  838             if mobj is None:
  839                 break
  840             start, body_start = mobj.span()
  841             body, remaining = self._separate_at_paren(code[body_start - 1:], '}')
  842             name = self._named_object(
  843                 local_vars,
  844                 self.extract_function_from_code(
  845                     self.build_arglist(mobj.group('args')),
  846                     body, local_vars, *global_stack))
  847             code = code[:start] + name + remaining
  848         return self.build_function(argnames, code, local_vars, *global_stack)
  849 
  850     def call_function(self, funcname, *args):
  851         return self.extract_function(funcname)(args)
  852 
  853     @classmethod
  854     def build_arglist(cls, arg_text):
  855         if not arg_text:
  856             return []
  857 
  858         def valid_arg(y):
  859             y = y.strip()
  860             if not y:
  861                 raise cls.Exception('Missing arg in "%s"' % (arg_text, ))
  862             return y
  863 
  864         return [valid_arg(x) for x in cls._separate(arg_text)]
  865 
  866     def build_function(self, argnames, code, *global_stack):
  867         global_stack = list(global_stack) or [{}]
  868         argnames = tuple(argnames)
  869 
  870         def resf(args, kwargs={}, allow_recursion=100):
  871             global_stack[0].update(
  872                 zip_longest(argnames, args, fillvalue=None))
  873             global_stack[0].update(kwargs)
  874             var_stack = LocalNameSpace(*global_stack)
  875             ret, should_abort = self.interpret_statement(code.replace('\n', ''), var_stack, allow_recursion - 1)
  876             if should_abort:
  877                 return ret
  878         return resf

Generated by cgit