1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
import collections
import contextlib
import copy
import datetime
import errno
import fileinput
import functools
import io
import itertools
import json
import locale
import operator
import os
import platform
import random
import re
import shutil
import socket
import subprocess
import sys
import time
import tokenize
import traceback
28
29 from string import ascii_letters
30
31 from .compat import (
32 compat_basestring,
33 compat_cookiejar,
34 compat_get_terminal_size,
35 compat_http_client,
36 compat_kwargs,
37 compat_numeric_types,
38 compat_os_name,
39 compat_str,
40 compat_tokenize_tokenize,
41 compat_urllib_error,
42 compat_urllib_request,
43 compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46 age_restricted,
47 args_to_str,
48 ContentTooShortError,
49 date_from_str,
50 DateRange,
51 DEFAULT_OUTTMPL,
52 determine_ext,
53 determine_protocol,
54 DownloadError,
55 encode_compat_str,
56 encodeFilename,
57 error_to_compat_str,
58 expand_path,
59 ExtractorError,
60 format_bytes,
61 formatSeconds,
62 GeoRestrictedError,
63 int_or_none,
64 ISO3166Utils,
65 locked_file,
66 make_HTTPS_handler,
67 MaxDownloadsReached,
68 orderedSet,
69 PagedList,
70 parse_filesize,
71 PerRequestProxyHandler,
72 platform_name,
73 PostProcessingError,
74 preferredencoding,
75 prepend_extension,
76 process_communicate_or_kill,
77 register_socks_protocols,
78 render_table,
79 replace_extension,
80 SameFileError,
81 sanitize_filename,
82 sanitize_path,
83 sanitize_url,
84 sanitized_Request,
85 std_headers,
86 str_or_none,
87 subtitles_filename,
88 UnavailableVideoError,
89 url_basename,
90 variadic,
91 version_tuple,
92 write_json_file,
93 write_string,
94 YoutubeDLCookieJar,
95 YoutubeDLCookieProcessor,
96 YoutubeDLHandler,
97 YoutubeDLRedirectHandler,
98 )
99 from .cache import Cache
100 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
101 from .extractor.openload import PhantomJSwrapper
102 from .downloader import get_suitable_downloader
103 from .downloader.rtmp import rtmpdump_version
104 from .postprocessor import (
105 FFmpegFixupM3u8PP,
106 FFmpegFixupM4aPP,
107 FFmpegFixupStretchedPP,
108 FFmpegMergerPP,
109 FFmpegPostProcessor,
110 get_postprocessor,
111 )
112 from .version import __version__
113
114 if compat_os_name == 'nt':
115 import ctypes
116
117
118 class YoutubeDL(object):
119 """YoutubeDL class.
120
    YoutubeDL objects are the ones responsible for downloading the
122 actual video file and writing it to disk if the user has requested
123 it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors handle),
    it has to pass the URL to one of them.
127
128 For this, YoutubeDL objects have a method that allows
129 InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it over to the first InfoExtractor it
131 finds that reports being able to handle it. The InfoExtractor extracts
132 all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
134 Downloader to download the video.
135
136 YoutubeDL objects accept a lot of parameters. In order not to saturate
137 the object constructor with arguments, it receives a dictionary of
138 options instead. These options are available through the params
139 attribute for the InfoExtractors to use. The YoutubeDL also
140 registers itself as the downloader in charge for the InfoExtractors
141 that are added to it, so this is a "mutual registration".
142
143 Available options:
144
145 username: Username for authentication purposes.
146 password: Password for authentication purposes.
147 videopassword: Password for accessing a video.
148 ap_mso: Adobe Pass multiple-system operator identifier.
149 ap_username: Multiple-system operator account username.
150 ap_password: Multiple-system operator account password.
151 usenetrc: Use netrc for authentication instead.
152 verbose: Print additional info to stdout.
153 quiet: Do not print messages to stdout.
154 no_warnings: Do not print out anything for warnings.
155 forceurl: Force printing final URL.
156 forcetitle: Force printing title.
157 forceid: Force printing ID.
158 forcethumbnail: Force printing thumbnail URL.
159 forcedescription: Force printing description.
160 forcefilename: Force printing final filename.
161 forceduration: Force printing duration.
162 forcejson: Force printing info_dict as JSON.
163 dump_single_json: Force printing the info_dict of the whole playlist
164 (or video) as a single JSON line.
165 simulate: Do not download the video files.
166 format: Video format code. See options.py for more information.
167 outtmpl: Template for output names.
168 outtmpl_na_placeholder: Placeholder for unavailable meta fields.
169 restrictfilenames: Do not allow "&" and spaces in file names
170 ignoreerrors: Do not stop on download errors.
171 force_generic_extractor: Force downloader to use the generic extractor
172 nooverwrites: Prevent overwriting files.
173 playliststart: Playlist item to start at.
174 playlistend: Playlist item to end at.
175 playlist_items: Specific indices of playlist to download.
176 playlistreverse: Download playlist items in reverse order.
177 playlistrandom: Download playlist items in random order.
178 matchtitle: Download only matching titles.
179 rejecttitle: Reject downloads for matching titles.
180 logger: Log messages to a logging.Logger instance.
181 logtostderr: Log messages to stderr instead of stdout.
182 writedescription: Write the video description to a .description file
183 writeinfojson: Write the video description to a .info.json file
184 writeannotations: Write the video annotations to a .annotations.xml file
185 writethumbnail: Write the thumbnail image to a file
186 write_all_thumbnails: Write all thumbnail formats to files
187 writesubtitles: Write the video subtitles to a file
188 writeautomaticsub: Write the automatically generated subtitles to a file
189 allsubtitles: Downloads all the subtitles of the video
190 (requires writesubtitles or writeautomaticsub)
191 listsubtitles: Lists all available subtitles for the video
192 subtitlesformat: The format code for subtitles
193 subtitleslangs: List of languages of the subtitles to download
194 keepvideo: Keep the video file after post-processing
195 daterange: A DateRange object, download only if the upload_date is in the range.
196 skip_download: Skip the actual download of the video file
197 cachedir: Location of the cache files in the filesystem.
198 False to disable filesystem cache.
199 noplaylist: Download single video instead of a playlist if in doubt.
200 age_limit: An integer representing the user's age in years.
201 Unsuitable videos for the given age are skipped.
202 min_views: An integer representing the minimum view count the video
203 must have in order to not be skipped.
204 Videos without view count information are always
205 downloaded. None for no limit.
206 max_views: An integer representing the maximum view count.
207 Videos that are more popular than that are not
208 downloaded.
209 Videos without view count information are always
210 downloaded. None for no limit.
211 download_archive: File name of a file where all downloads are recorded.
212 Videos already present in the file are not downloaded
213 again.
214 cookiefile: File name where cookies should be read from and dumped to.
215 nocheckcertificate:Do not verify SSL certificates
216 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
217 At the moment, this is only supported by YouTube.
218 proxy: URL of the proxy server to use
219 geo_verification_proxy: URL of the proxy to use for IP address verification
220 on geo-restricted sites.
221 socket_timeout: Time to wait for unresponsive hosts, in seconds
222 bidi_workaround: Work around buggy terminals without bidirectional text
                       support, using fribidi
224 debug_printtraffic:Print out sent and received HTTP traffic
225 include_ads: Download ads as well
226 default_search: Prepend this string if an input url is not valid.
227 'auto' for elaborate guessing
228 encoding: Use this encoding instead of the system-specified.
229 extract_flat: Do not resolve URLs, return the immediate result.
230 Pass in 'in_playlist' to only show this behavior for
231 playlist items.
232 postprocessors: A list of dictionaries, each with an entry
233 * key: The name of the postprocessor. See
234 youtube_dl/postprocessor/__init__.py for a list.
235 as well as any further keyword arguments for the
236 postprocessor.
237 progress_hooks: A list of functions that get called on download
238 progress, with a dictionary with the entries
239 * status: One of "downloading", "error", or "finished".
240 Check this first and ignore unknown values.
241
242 If status is one of "downloading", or "finished", the
243 following properties may also be present:
244 * filename: The final filename (always present)
245 * tmpfilename: The filename we're currently writing to
246 * downloaded_bytes: Bytes on disk
247 * total_bytes: Size of the whole file, None if unknown
248 * total_bytes_estimate: Guess of the eventual file size,
249 None if unavailable.
250 * elapsed: The number of seconds since download started.
251 * eta: The estimated time in seconds, None if unknown
252 * speed: The download speed in bytes/second, None if
253 unknown
254 * fragment_index: The counter of the currently
255 downloaded video fragment.
256 * fragment_count: The number of fragments (= individual
257 files that will be merged)
258
259 Progress hooks are guaranteed to be called at least once
260 (with status "finished") if the download is successful.
261 merge_output_format: Extension to use when merging formats.
262 fixup: Automatically correct known faults of the file.
263 One of:
264 - "never": do nothing
265 - "warn": only emit a warning
266 - "detect_or_warn": check whether we can do anything
267 about it, warn otherwise (default)
268 source_address: Client-side IP address to bind to.
269 call_home: Boolean, true iff we are allowed to contact the
270 youtube-dl servers for debugging.
271 sleep_interval: Number of seconds to sleep before each download when
272 used alone or a lower bound of a range for randomized
273 sleep before each download (minimum possible number
274 of seconds to sleep) when used along with
275 max_sleep_interval.
276 max_sleep_interval:Upper bound of a range for randomized sleep before each
277 download (maximum possible number of seconds to sleep).
278 Must only be used along with sleep_interval.
279 Actual sleep time will be a random float from range
280 [sleep_interval; max_sleep_interval].
281 listformats: Print an overview of available video formats and exit.
282 list_thumbnails: Print a table of all thumbnails and exit.
283 match_filter: A function that gets called with the info_dict of
284 every video.
285 If it returns a message, the video is ignored.
286 If it returns None, the video is downloaded.
287 match_filter_func in utils.py is one example for this.
288 no_color: Do not emit color codes in output.
289 geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
290 HTTP header
291 geo_bypass_country:
292 Two-letter ISO 3166-2 country code that will be used for
293 explicit geographic restriction bypassing via faking
294 X-Forwarded-For HTTP header
295 geo_bypass_ip_block:
296 IP range in CIDR notation that will be used similarly to
297 geo_bypass_country
298
299 The following options determine which downloader is picked:
300 external_downloader: Executable of the external downloader to call.
301 None or unset for standard (built-in) downloader.
302 hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
303 if True, otherwise use ffmpeg/avconv if False, otherwise
304 use downloader suggested by extractor if None.
305
306 The following parameters are not used by YoutubeDL itself, they are used by
307 the downloader (see youtube_dl/downloader/common.py):
308 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
309 noresizebuffer, retries, continuedl, noprogress, consoletitle,
310 xattr_set_filesize, external_downloader_args, hls_use_mpegts,
311 http_chunk_size.
312
313 The following options are used by the post processors:
314 prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
315 otherwise prefer ffmpeg.
316 ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
317 to the binary or its containing directory.
318 postprocessor_args: A list of additional command-line arguments for the
319 postprocessor.
320
321 The following options are used by the Youtube extractor:
322 youtube_include_dash_manifest: If True (default), DASH manifests and related
323 data will be downloaded and processed by extractor.
324 You can reduce network I/O by disabling it if you don't
325 care about DASH.
326 """
327
    # Metadata fields expected to hold numeric values. prepare_filename()
    # uses this to patch the output template when such a field is missing,
    # so the string NA placeholder does not break integer presentation
    # types like %(height)d.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level defaults. Most are re-assigned per instance in __init__;
    # note that _playlist_level and _playlist_urls are NOT reset there and
    # so act as class-wide state unless overwritten elsewhere in the file.
    params = None  # dict of options (see the class docstring)
    _ies = []  # registered InfoExtractor classes/instances
    _pps = []  # registered PostProcessor instances
    _download_retcode = None  # exit code accumulated across downloads
    _num_downloads = None  # number of files downloaded so far
    _playlist_level = 0  # current depth of nested playlist extraction
    _playlist_urls = set()  # playlist URLs currently being processed (recursion guard)
    _screen_file = None  # file object used for normal screen output
347
348 def __init__(self, params=None, auto_init=True):
349 """Create a FileDownloader object with the given options."""
350 if params is None:
351 params = {}
352 self._ies = []
353 self._ies_instances = {}
354 self._pps = []
355 self._progress_hooks = []
356 self._download_retcode = 0
357 self._num_downloads = 0
358 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
359 self._err_file = sys.stderr
360 self.params = {
361 # Default parameters
362 'nocheckcertificate': False,
363 }
364 self.params.update(params)
365 self.cache = Cache(self)
366
367 def check_deprecated(param, option, suggestion):
368 if self.params.get(param) is not None:
369 self.report_warning(
370 '%s is deprecated. Use %s instead.' % (option, suggestion))
371 return True
372 return False
373
374 if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
375 if self.params.get('geo_verification_proxy') is None:
376 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
377
378 check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
379 check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
380 check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
381
382 if params.get('bidi_workaround', False):
383 try:
384 import pty
385 master, slave = pty.openpty()
386 width = compat_get_terminal_size().columns
387 if width is None:
388 width_args = []
389 else:
390 width_args = ['-w', str(width)]
391 sp_kwargs = dict(
392 stdin=subprocess.PIPE,
393 stdout=slave,
394 stderr=self._err_file)
395 try:
396 self._output_process = subprocess.Popen(
397 ['bidiv'] + width_args, **sp_kwargs
398 )
399 except OSError:
400 self._output_process = subprocess.Popen(
401 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
402 self._output_channel = os.fdopen(master, 'rb')
403 except OSError as ose:
404 if ose.errno == errno.ENOENT:
405 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
406 else:
407 raise
408
409 if (sys.platform != 'win32'
410 and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
411 and not params.get('restrictfilenames', False)):
412 # Unicode filesystem API will throw errors (#1474, #13027)
413 self.report_warning(
414 'Assuming --restrict-filenames since file system encoding '
415 'cannot encode all characters. '
416 'Set the LC_ALL environment variable to fix this.')
417 self.params['restrictfilenames'] = True
418
419 if isinstance(params.get('outtmpl'), bytes):
420 self.report_warning(
421 'Parameter outtmpl is bytes, but should be a unicode string. '
422 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
423
424 self._setup_opener()
425
426 if auto_init:
427 self.print_debug_header()
428 self.add_default_info_extractors()
429
430 for pp_def_raw in self.params.get('postprocessors', []):
431 pp_class = get_postprocessor(pp_def_raw['key'])
432 pp_def = dict(pp_def_raw)
433 del pp_def['key']
434 pp = pp_class(self, **compat_kwargs(pp_def))
435 self.add_post_processor(pp)
436
437 for ph in self.params.get('progress_hooks', []):
438 self.add_progress_hook(ph)
439
440 register_socks_protocols()
441
442 def warn_if_short_id(self, argv):
443 # short YouTube ID starting with dash?
444 idxs = [
445 i for i, a in enumerate(argv)
446 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
447 if idxs:
448 correct_argv = (
449 ['youtube-dl']
450 + [a for i, a in enumerate(argv) if i not in idxs]
451 + ['--'] + [argv[i] for i in idxs]
452 )
453 self.report_warning(
454 'Long argument string detected. '
455 'Use -- to separate parameters and URLs, like this:\n%s\n' %
456 args_to_str(correct_argv))
457
458 def add_info_extractor(self, ie):
459 """Add an InfoExtractor object to the end of the list."""
460 self._ies.append(ie)
461 if not isinstance(ie, type):
462 self._ies_instances[ie.ie_key()] = ie
463 ie.set_downloader(self)
464
465 def get_info_extractor(self, ie_key):
466 """
467 Get an instance of an IE with name ie_key, it will try to get one from
468 the _ies list, if there's no instance it will create a new one and add
469 it to the extractor list.
470 """
471 ie = self._ies_instances.get(ie_key)
472 if ie is None:
473 ie = get_info_extractor(ie_key)()
474 self.add_info_extractor(ie)
475 return ie
476
477 def add_default_info_extractors(self):
478 """
479 Add the InfoExtractors returned by gen_extractors to the end of the list
480 """
481 for ie in gen_extractor_classes():
482 self.add_info_extractor(ie)
483
484 def add_post_processor(self, pp):
485 """Add a PostProcessor object to the end of the chain."""
486 self._pps.append(pp)
487 pp.set_downloader(self)
488
489 def add_progress_hook(self, ph):
490 """Add the progress hook (currently only for the file downloader)"""
491 self._progress_hooks.append(ph)
492
493 def _bidi_workaround(self, message):
494 if not hasattr(self, '_output_channel'):
495 return message
496
497 assert hasattr(self, '_output_process')
498 assert isinstance(message, compat_str)
499 line_count = message.count('\n') + 1
500 self._output_process.stdin.write((message + '\n').encode('utf-8'))
501 self._output_process.stdin.flush()
502 res = ''.join(self._output_channel.readline().decode('utf-8')
503 for _ in range(line_count))
504 return res[:-len('\n')]
505
506 def to_screen(self, message, skip_eol=False):
507 """Print message to stdout if not in quiet mode."""
508 return self.to_stdout(message, skip_eol, check_quiet=True)
509
510 def _write_string(self, s, out=None):
511 write_string(s, out=out, encoding=self.params.get('encoding'))
512
513 def to_stdout(self, message, skip_eol=False, check_quiet=False):
514 """Print message to stdout if not in quiet mode."""
515 if self.params.get('logger'):
516 self.params['logger'].debug(message)
517 elif not check_quiet or not self.params.get('quiet', False):
518 message = self._bidi_workaround(message)
519 terminator = ['\n', ''][skip_eol]
520 output = message + terminator
521
522 self._write_string(output, self._screen_file)
523
524 def to_stderr(self, message):
525 """Print message to stderr."""
526 assert isinstance(message, compat_str)
527 if self.params.get('logger'):
528 self.params['logger'].error(message)
529 else:
530 message = self._bidi_workaround(message)
531 output = message + '\n'
532 self._write_string(output, self._err_file)
533
534 def to_console_title(self, message):
535 if not self.params.get('consoletitle', False):
536 return
537 if compat_os_name == 'nt':
538 if ctypes.windll.kernel32.GetConsoleWindow():
539 # c_wchar_p() might not be necessary if `message` is
540 # already of type unicode()
541 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
542 elif 'TERM' in os.environ:
543 self._write_string('\033]0;%s\007' % message, self._screen_file)
544
545 def save_console_title(self):
546 if not self.params.get('consoletitle', False):
547 return
548 if self.params.get('simulate', False):
549 return
550 if compat_os_name != 'nt' and 'TERM' in os.environ:
551 # Save the title on stack
552 self._write_string('\033[22;0t', self._screen_file)
553
554 def restore_console_title(self):
555 if not self.params.get('consoletitle', False):
556 return
557 if self.params.get('simulate', False):
558 return
559 if compat_os_name != 'nt' and 'TERM' in os.environ:
560 # Restore the title from stack
561 self._write_string('\033[23;0t', self._screen_file)
562
563 def __enter__(self):
564 self.save_console_title()
565 return self
566
567 def __exit__(self, *args):
568 self.restore_console_title()
569
570 if self.params.get('cookiefile') is not None:
571 self.cookiejar.save(ignore_discard=True, ignore_expires=True)
572
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.

        Raises DownloadError unless the 'ignoreerrors' option is set,
        in which case the download return code is set to 1 instead.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the traceback of a wrapped exception when the
                    # active exception carries one (e.g. ExtractorError.exc_info).
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an exception handler: dump the current stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise with the most specific exc_info available so callers
            # catching DownloadError can inspect the original cause.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
602
603 def report_warning(self, message):
604 '''
605 Print the message to stderr, it will be prefixed with 'WARNING:'
606 If stderr is a tty file the 'WARNING:' will be colored
607 '''
608 if self.params.get('logger') is not None:
609 self.params['logger'].warning(message)
610 else:
611 if self.params.get('no_warnings'):
612 return
613 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
614 _msg_header = '\033[0;33mWARNING:\033[0m'
615 else:
616 _msg_header = 'WARNING:'
617 warning_message = '%s %s' % (_msg_header, message)
618 self.to_stderr(warning_message)
619
620 def report_error(self, message, tb=None):
621 '''
622 Do the same as trouble, but prefixes the message with 'ERROR:', colored
623 in red if stderr is a tty file.
624 '''
625 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
626 _msg_header = '\033[0;31mERROR:\033[0m'
627 else:
628 _msg_header = 'ERROR:'
629 error_message = '%s %s' % (_msg_header, message)
630 self.trouble(error_message, tb)
631
632 def report_file_already_downloaded(self, file_name):
633 """Report file has already been fully downloaded."""
634 try:
635 self.to_screen('[download] %s has already been downloaded' % file_name)
636 except UnicodeEncodeError:
637 self.to_screen('[download] The file has already been downloaded')
638
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Fills the configured output template ('outtmpl') with the metadata
        from info_dict, sanitizing field values for use in file names.
        Returns the sanitized path, or None if the template is invalid.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            # autonumber counts completed downloads, offset by autonumber_start.
            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
            # Derive a human-readable 'resolution' field when absent.
            if template_dict.get('resolution') is None:
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            # Sanitize every non-numeric scalar value; id-like fields get
            # the less aggressive is_id treatment.
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id' or k.endswith('_id')))
            template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None and not isinstance(v, (list, tuple, dict)))
            # Missing fields substitute the NA placeholder instead of raising.
            template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)

            # For fields playlist_index and autonumber convert all occurrences
            # of %(field)s to %(field)0Nd for backward compatibility
            field_size_compat_map = {
                'playlist_index': len(str(template_dict['n_entries'])),
                'autonumber': autonumber_size,
            }
            FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
            mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
            if mobj:
                outtmpl = re.sub(
                    FIELD_SIZE_COMPAT_RE,
                    r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                    outtmpl)

            # Missing numeric fields used together with integer presentation types
            # in format specification will break the argument substitution since
            # string NA placeholder is returned for missing fields. We will patch
            # output template for missing fields to meet string presentation type.
            for numeric_field in self._NUMERIC_FIELDS:
                if numeric_field not in template_dict:
                    # As of [1] format syntax is:
                    # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
                    # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
                    FORMAT_RE = r'''(?x)
                        (?<!%)
                        %
                        \({0}\)  # mapping key
                        (?:[#0\-+ ]+)?  # conversion flags (optional)
                        (?:\d+)?  # minimum field width (optional)
                        (?:\.\d+)?  # precision (optional)
                        [hlL]?  # length modifier (optional)
                        [diouxXeEfFgGcrs%]  # conversion type
                    '''
                    outtmpl = re.sub(
                        FORMAT_RE.format(numeric_field),
                        r'%({0})s'.format(numeric_field), outtmpl)

            # expand_path translates '%%' into '%' and '$$' into '$'
            # correspondingly that is not what we want since we need to keep
            # '%%' intact for template dict substitution step. Working around
            # with boundary-alike separator hack.
            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))

            # outtmpl should be expand_path'ed before template dict substitution
            # because meta fields may contain env variables we don't want to
            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
            # title "Hello $PATH", we don't want `$PATH` to be expanded.
            filename = expand_path(outtmpl).replace(sep, '') % template_dict

            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + error_to_compat_str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
727
728 def _match_entry(self, info_dict, incomplete):
729 """ Returns None iff the file should be downloaded """
730
731 video_title = info_dict.get('title', info_dict.get('id', 'video'))
732 if 'title' in info_dict:
733 # This can happen when we're just evaluating the playlist
734 title = info_dict['title']
735 matchtitle = self.params.get('matchtitle', False)
736 if matchtitle:
737 if not re.search(matchtitle, title, re.IGNORECASE):
738 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
739 rejecttitle = self.params.get('rejecttitle', False)
740 if rejecttitle:
741 if re.search(rejecttitle, title, re.IGNORECASE):
742 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
743 date = info_dict.get('upload_date')
744 if date is not None:
745 dateRange = self.params.get('daterange', DateRange())
746 if date not in dateRange:
747 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
748 view_count = info_dict.get('view_count')
749 if view_count is not None:
750 min_views = self.params.get('min_views')
751 if min_views is not None and view_count < min_views:
752 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
753 max_views = self.params.get('max_views')
754 if max_views is not None and view_count > max_views:
755 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
756 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
757 return 'Skipping "%s" because it is age restricted' % video_title
758 if self.in_download_archive(info_dict):
759 return '%s has already been recorded in archive' % video_title
760
761 if not incomplete:
762 match_filter = self.params.get('match_filter')
763 if match_filter is not None:
764 ret = match_filter(info_dict)
765 if ret is not None:
766 return ret
767
768 return None
769
770 @staticmethod
771 def add_extra_info(info_dict, extra_info):
772 '''Set the keys from extra_info in info dict if they are missing'''
773 for key, value in extra_info.items():
774 info_dict.setdefault(key, value)
775
776 def extract_info(self, url, download=True, ie_key=None, extra_info={},
777 process=True, force_generic_extractor=False):
778 """
779 Return a list with a dictionary for each video extracted.
780
781 Arguments:
782 url -- URL to extract
783
784 Keyword arguments:
785 download -- whether to download videos during extraction
786 ie_key -- extractor key hint
787 extra_info -- dictionary containing the extra values to add to each result
788 process -- whether to resolve all unresolved references (URLs, playlist items),
789 must be True for download to work.
790 force_generic_extractor -- force using the generic extractor
791 """
792
793 if not ie_key and force_generic_extractor:
794 ie_key = 'Generic'
795
796 if ie_key:
797 ies = [self.get_info_extractor(ie_key)]
798 else:
799 ies = self._ies
800
801 for ie in ies:
802 if not ie.suitable(url):
803 continue
804
805 ie = self.get_info_extractor(ie.ie_key())
806 if not ie.working():
807 self.report_warning('The program functionality for this site has been marked as broken, '
808 'and will probably not work.')
809
810 return self.__extract_info(url, ie, download, extra_info, process)
811 else:
812 self.report_error('no suitable InfoExtractor for URL %s' % url)
813
814 def __handle_extraction_exceptions(func):
815 def wrapper(self, *args, **kwargs):
816 try:
817 return func(self, *args, **kwargs)
818 except GeoRestrictedError as e:
819 msg = e.msg
820 if e.countries:
821 msg += '\nThis video is available in %s.' % ', '.join(
822 map(ISO3166Utils.short2full, e.countries))
823 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
824 self.report_error(msg)
825 except ExtractorError as e: # An error we somewhat expected
826 self.report_error(compat_str(e), e.format_traceback())
827 except MaxDownloadsReached:
828 raise
829 except Exception as e:
830 if self.params.get('ignoreerrors', False):
831 self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
832 else:
833 raise
834 return wrapper
835
836 @__handle_extraction_exceptions
837 def __extract_info(self, url, ie, download, extra_info, process):
838 ie_result = ie.extract(url)
839 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
840 return
841 if isinstance(ie_result, list):
842 # Backwards compatibility: old IE result format
843 ie_result = {
844 '_type': 'compat_list',
845 'entries': ie_result,
846 }
847 self.add_default_extra_info(ie_result, ie, url)
848 if process:
849 return self.process_ie_result(ie_result, download, extra_info)
850 else:
851 return ie_result
852
853 def add_default_extra_info(self, ie_result, ie, url):
854 self.add_extra_info(ie_result, {
855 'extractor': ie.IE_NAME,
856 'webpage_url': url,
857 'webpage_url_basename': url_basename(url),
858 'extractor_key': ie.ie_key(),
859 })
860
861 def process_ie_result(self, ie_result, download=True, extra_info={}):
862 """
863 Take the result of the ie(may be modified) and resolve all unresolved
864 references (URLs, playlist items).
865
866 It will also download the videos if 'download'.
867 Returns the resolved ie_result.
868 """
869 result_type = ie_result.get('_type', 'video')
870
871 if result_type in ('url', 'url_transparent'):
872 ie_result['url'] = sanitize_url(ie_result['url'])
873 extract_flat = self.params.get('extract_flat', False)
874 if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
875 or extract_flat is True):
876 self.__forced_printings(
877 ie_result, self.prepare_filename(ie_result),
878 incomplete=True)
879 return ie_result
880
881 if result_type == 'video':
882 self.add_extra_info(ie_result, extra_info)
883 return self.process_video_result(ie_result, download=download)
884 elif result_type == 'url':
885 # We have to add extra_info to the results because it may be
886 # contained in a playlist
887 return self.extract_info(ie_result['url'],
888 download,
889 ie_key=ie_result.get('ie_key'),
890 extra_info=extra_info)
891 elif result_type == 'url_transparent':
892 # Use the information from the embedding page
893 info = self.extract_info(
894 ie_result['url'], ie_key=ie_result.get('ie_key'),
895 extra_info=extra_info, download=False, process=False)
896
897 # extract_info may return None when ignoreerrors is enabled and
898 # extraction failed with an error, don't crash and return early
899 # in this case
900 if not info:
901 return info
902
903 force_properties = dict(
904 (k, v) for k, v in ie_result.items() if v is not None)
905 for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
906 if f in force_properties:
907 del force_properties[f]
908 new_result = info.copy()
909 new_result.update(force_properties)
910
911 # Extracted info may not be a video result (i.e.
912 # info.get('_type', 'video') != video) but rather an url or
913 # url_transparent. In such cases outer metadata (from ie_result)
914 # should be propagated to inner one (info). For this to happen
915 # _type of info should be overridden with url_transparent. This
916 # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
917 if new_result.get('_type') == 'url':
918 new_result['_type'] = 'url_transparent'
919
920 return self.process_ie_result(
921 new_result, download=download, extra_info=extra_info)
922 elif result_type in ('playlist', 'multi_video'):
923 # Protect from infinite recursion due to recursively nested playlists
924 # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
925 webpage_url = ie_result['webpage_url']
926 if webpage_url in self._playlist_urls:
927 self.to_screen(
928 '[download] Skipping already downloaded playlist: %s'
929 % ie_result.get('title') or ie_result.get('id'))
930 return
931
932 self._playlist_level += 1
933 self._playlist_urls.add(webpage_url)
934 try:
935 return self.__process_playlist(ie_result, download)
936 finally:
937 self._playlist_level -= 1
938 if not self._playlist_level:
939 self._playlist_urls.clear()
940 elif result_type == 'compat_list':
941 self.report_warning(
942 'Extractor %s returned a compat_list result. '
943 'It needs to be updated.' % ie_result.get('extractor'))
944
945 def _fixup(r):
946 self.add_extra_info(
947 r,
948 {
949 'extractor': ie_result['extractor'],
950 'webpage_url': ie_result['webpage_url'],
951 'webpage_url_basename': url_basename(ie_result['webpage_url']),
952 'extractor_key': ie_result['extractor_key'],
953 }
954 )
955 return r
956 ie_result['entries'] = [
957 self.process_ie_result(_fixup(r), download, extra_info)
958 for r in ie_result['entries']
959 ]
960 return ie_result
961 else:
962 raise Exception('Invalid result type: %s' % result_type)
963
    def __process_playlist(self, ie_result, download):
        """Resolve and (optionally) download every selected entry of a playlist.

        Applies playliststart/playlistend/playlist_items selection, then
        playlistreverse/playlistrandom ordering, processes each entry through
        __process_iterable_entry and stores the results back into
        ie_result['entries'].
        """
        # We process each entry in the playlist
        playlist = ie_result.get('title') or ie_result.get('id')

        self.to_screen('[download] Downloading playlist: %s' % playlist)

        playlist_results = []

        # playliststart is 1-based in params; convert to a 0-based slice start.
        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlistend == -1:
            playlistend = None

        playlistitems_str = self.params.get('playlist_items')
        playlistitems = None
        if playlistitems_str is not None:
            # Parse a spec such as '1-3,7,10-12' into 1-based indices.
            def iter_playlistitems(format):
                for string_segment in format.split(','):
                    if '-' in string_segment:
                        start, end = string_segment.split('-')
                        for item in range(int(start), int(end) + 1):
                            yield int(item)
                    else:
                        yield int(string_segment)
            # orderedSet de-duplicates while preserving the requested order.
            playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

        ie_entries = ie_result['entries']

        def make_playlistitems_entries(list_ie_entries):
            # Pick the requested 1-based indices, silently dropping any that
            # fall outside the list (negative indices wrap from the end).
            num_entries = len(list_ie_entries)
            return [
                list_ie_entries[i - 1] for i in playlistitems
                if -num_entries <= i - 1 < num_entries]

        def report_download(num_entries):
            self.to_screen(
                '[%s] playlist %s: Downloading %d videos' %
                (ie_result['extractor'], playlist, num_entries))

        # Entries may arrive as a concrete list, a lazily paged PagedList, or
        # a plain iterable/generator; each needs its own slicing strategy.
        if isinstance(ie_entries, list):
            n_all_entries = len(ie_entries)
            if playlistitems:
                entries = make_playlistitems_entries(ie_entries)
            else:
                entries = ie_entries[playliststart:playlistend]
            n_entries = len(entries)
            self.to_screen(
                '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                (ie_result['extractor'], playlist, n_all_entries, n_entries))
        elif isinstance(ie_entries, PagedList):
            if playlistitems:
                # Fetch each requested item as its own one-element slice so
                # only the needed pages are downloaded.
                entries = []
                for item in playlistitems:
                    entries.extend(ie_entries.getslice(
                        item - 1, item
                    ))
            else:
                entries = ie_entries.getslice(
                    playliststart, playlistend)
            n_entries = len(entries)
            report_download(n_entries)
        else:  # iterable
            if playlistitems:
                # Materialize only up to the largest requested index, then
                # select from that prefix.
                entries = make_playlistitems_entries(list(itertools.islice(
                    ie_entries, 0, max(playlistitems))))
            else:
                entries = list(itertools.islice(
                    ie_entries, playliststart, playlistend))
            n_entries = len(entries)
            report_download(n_entries)

        if self.params.get('playlistreverse', False):
            entries = entries[::-1]

        if self.params.get('playlistrandom', False):
            random.shuffle(entries)

        x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

        for i, entry in enumerate(entries, 1):
            self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
            # This __x_forwarded_for_ip thing is a bit ugly but requires
            # minimal changes
            if x_forwarded_for:
                entry['__x_forwarded_for_ip'] = x_forwarded_for
            extra = {
                'n_entries': n_entries,
                'playlist': playlist,
                'playlist_id': ie_result.get('id'),
                'playlist_title': ie_result.get('title'),
                'playlist_uploader': ie_result.get('uploader'),
                'playlist_uploader_id': ie_result.get('uploader_id'),
                # With playlist_items the original position is taken from the
                # spec; otherwise it is the 1-based offset into the full list.
                # NOTE(review): with playlistreverse/playlistrandom this index
                # reflects the shuffled order, not the original one — confirm
                # whether that is intended.
                'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
                'extractor': ie_result['extractor'],
                'webpage_url': ie_result['webpage_url'],
                'webpage_url_basename': url_basename(ie_result['webpage_url']),
                'extractor_key': ie_result['extractor_key'],
            }

            # incomplete=True: per-video filters that need full metadata are
            # deferred to process_video_result.
            reason = self._match_entry(entry, incomplete=True)
            if reason is not None:
                self.to_screen('[download] ' + reason)
                continue

            entry_result = self.__process_iterable_entry(entry, download, extra)
            # TODO: skip failed (empty) entries?
            playlist_results.append(entry_result)
        ie_result['entries'] = playlist_results
        self.to_screen('[download] Finished downloading playlist: %s' % playlist)
        return ie_result
1075
1076 @__handle_extraction_exceptions
1077 def __process_iterable_entry(self, entry, download, extra_info):
1078 return self.process_ie_result(
1079 entry, download=download, extra_info=extra_info)
1080
1081 def _build_format_filter(self, filter_spec):
1082 " Returns a function to filter the formats according to the filter_spec "
1083
1084 OPERATORS = {
1085 '<': operator.lt,
1086 '<=': operator.le,
1087 '>': operator.gt,
1088 '>=': operator.ge,
1089 '=': operator.eq,
1090 '!=': operator.ne,
1091 }
1092 operator_rex = re.compile(r'''(?x)\s*
1093 (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
1094 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1095 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1096 $
1097 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1098 m = operator_rex.search(filter_spec)
1099 if m:
1100 try:
1101 comparison_value = int(m.group('value'))
1102 except ValueError:
1103 comparison_value = parse_filesize(m.group('value'))
1104 if comparison_value is None:
1105 comparison_value = parse_filesize(m.group('value') + 'B')
1106 if comparison_value is None:
1107 raise ValueError(
1108 'Invalid value %r in format specification %r' % (
1109 m.group('value'), filter_spec))
1110 op = OPERATORS[m.group('op')]
1111
1112 if not m:
1113 STR_OPERATORS = {
1114 '=': operator.eq,
1115 '^=': lambda attr, value: attr.startswith(value),
1116 '$=': lambda attr, value: attr.endswith(value),
1117 '*=': lambda attr, value: value in attr,
1118 }
1119 str_operator_rex = re.compile(r'''(?x)
1120 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id|language)
1121 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
1122 \s*(?P<value>[a-zA-Z0-9._-]+)
1123 \s*$
1124 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1125 m = str_operator_rex.search(filter_spec)
1126 if m:
1127 comparison_value = m.group('value')
1128 str_op = STR_OPERATORS[m.group('op')]
1129 if m.group('negation'):
1130 op = lambda attr, value: not str_op(attr, value)
1131 else:
1132 op = str_op
1133
1134 if not m:
1135 raise ValueError('Invalid filter specification %r' % filter_spec)
1136
1137 def _filter(f):
1138 actual_value = f.get(m.group('key'))
1139 if actual_value is None:
1140 return m.group('none_inclusive')
1141 return op(actual_value, comparison_value)
1142 return _filter
1143
1144 def _default_format_spec(self, info_dict, download=True):
1145
1146 def can_merge():
1147 merger = FFmpegMergerPP(self)
1148 return merger.available and merger.can_merge()
1149
1150 def prefer_best():
1151 if self.params.get('simulate', False):
1152 return False
1153 if not download:
1154 return False
1155 if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1156 return True
1157 if info_dict.get('is_live'):
1158 return True
1159 if not can_merge():
1160 return True
1161 return False
1162
1163 req_format_list = ['bestvideo+bestaudio', 'best']
1164 if prefer_best():
1165 req_format_list.reverse()
1166 return '/'.join(req_format_list)
1167
    def build_format_selector(self, format_spec):
        """Compile a format spec string (e.g. 'bestvideo[height<=720]+bestaudio/best')
        into a selector function.

        The spec is tokenized with Python's own tokenizer, parsed into a tree
        of FormatSelector nodes, then compiled into nested closures. The
        returned callable takes a ctx dict (keys: 'formats',
        'incomplete_formats') and yields the selected format dicts.
        """
        def syntax_error(note, start):
            # start is a tokenizer (row, col) pair; the caret marks the column.
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node types of the parsed selector tree:
        PICKFIRST = 'PICKFIRST'  # 'a/b' - first alternative that yields formats
        MERGE = 'MERGE'          # 'a+b' - video+audio merge
        SINGLE = 'SINGLE'        # plain name such as 'best', an ext or a format_id
        GROUP = 'GROUP'          # parenthesized sub-expression
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Re-join the raw token strings of a [filter] up to the closing ']'.
            # NOTE(review): returns None implicitly if ']' never appears, but
            # an unbalanced bracket raises TokenError during tokenization.
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            # last_line is never reassigned below, so merged tokens carry
            # line=None; the parser ignores the line field anyway.
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Accumulate adjacent NAME/NUMBER/other-OP tokens into a
                    # single NAME token (handles ids like 'mp4-baseline-16x9').
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parse of one selector list; the inside_* flags
            # tell us which delimiters end the current nesting level.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                            break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A bare leading filter, e.g. '[height<=720]', applies
                        # to an implicit 'best'.
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        if inside_merge:
                            raise syntax_error('Unexpected "+"', start)
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Compile a node (or list of sibling nodes) into a generator
            # function over ctx.
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(ctx):
                    for f in fs:
                        for format in f(ctx):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(ctx):
                    # First alternative that yields any formats wins.
                    for f in fs:
                        picked_formats = list(f(ctx))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(ctx):

                    def best_worst(fmts, fmt_spec='best'):
                        # 'best' means the last entry, 'worst' the first; the
                        # formats list is assumed sorted worst-to-best here.
                        format_idx = 0 if fmt_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in fmts
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            return audiovideo_formats[format_idx]
                        # for extractors with incomplete formats (audio only (soundcloud)
                        # or video only (imgur)) we will fallback to best/worst
                        # {video,audio}-only format
                        elif ctx['incomplete_formats']:
                            return fmts[format_idx]

                    formats = list(ctx['formats'])
                    if not formats:
                        return
                    if format_spec == 'all':
                        pass
                    elif format_spec in ('best', 'worst', None):
                        formats = best_worst(formats, format_spec)
                    elif format_spec in ('bestaudio', 'worstaudio'):
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        formats = audio_formats[:1] if format_spec == 'worstaudio' else audio_formats[-1:]
                    elif format_spec in ('bestvideo', 'worstvideo'):
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        formats = video_formats[:1] if format_spec == 'worstvideo' else video_formats[-1:]
                    else:
                        # Anything else is an extension or a literal format_id.
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        formats = best_worst(list(filter(filter_f, formats)))
                    # variadic() lets a single dict and a list be iterated alike.
                    for f in variadic(formats or []):
                        yield f
            elif selector.type == MERGE:
                def _merge(formats_info):
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    # Video attributes come from the first format, audio
                    # attributes from the second.
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(ctx):
                    # deepcopy: each side must see the unfiltered context.
                    for pair in itertools.product(
                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                        yield _merge(pair)

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(ctx):
                # Apply this node's [filters] to a copy so siblings are unaffected.
                ctx_copy = copy.deepcopy(ctx)
                for _filter in filters:
                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                return selector_function(ctx_copy)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Minimal iterator with one-token pushback, needed because the
            # recursive parser must un-read delimiters for the caller.
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1423
1424 def _calc_headers(self, info_dict):
1425 res = std_headers.copy()
1426
1427 add_headers = info_dict.get('http_headers')
1428 if add_headers:
1429 res.update(add_headers)
1430
1431 cookies = self._calc_cookies(info_dict)
1432 if cookies:
1433 res['Cookie'] = cookies
1434
1435 if 'X-Forwarded-For' not in res:
1436 x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1437 if x_forwarded_for_ip:
1438 res['X-Forwarded-For'] = x_forwarded_for_ip
1439
1440 return res
1441
1442 def _calc_cookies(self, info_dict):
1443 pr = sanitized_Request(info_dict['url'])
1444 self.cookiejar.add_cookie_header(pr)
1445 return pr.get_header('Cookie')
1446
    def process_video_result(self, info_dict, download=True):
        """Sanitize and complete a single video result, then select the
        requested formats and (if download=True) hand each one to
        process_info.

        Returns the (mutated) info_dict, updated with the last selected
        format; returns None early for pure listing modes
        (list_thumbnails/listsubtitles/listformats).
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        def report_force_conversion(field, field_not, conversion):
            self.report_warning(
                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
                % (field, field_not, conversion))

        def sanitize_string_field(info, string_field):
            # Coerce a non-string value to str, warning about the extractor bug.
            field = info.get(string_field)
            if field is None or isinstance(field, compat_str):
                return
            report_force_conversion(string_field, 'a string', 'string')
            info[string_field] = compat_str(field)

        def sanitize_numeric_fields(info):
            # Coerce every known numeric field to int (or None), warning once each.
            for numeric_field in self._NUMERIC_FIELDS:
                field = info.get(numeric_field)
                if field is None or isinstance(field, compat_numeric_types):
                    continue
                report_force_conversion(numeric_field, 'numeric', 'int')
                info[numeric_field] = int_or_none(field)

        sanitize_string_field(info_dict, 'id')
        sanitize_numeric_fields(info_dict)

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # Normalize thumbnails: promote a single 'thumbnail' into the
        # 'thumbnails' list, sort worst-to-best, and fill in ids/resolutions.
        thumbnails = info_dict.get('thumbnails')
        if thumbnails is None:
            thumbnail = info_dict.get('thumbnail')
            if thumbnail:
                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
        if thumbnails:
            thumbnails.sort(key=lambda t: (
                t.get('preference') if t.get('preference') is not None else -1,
                t.get('width') if t.get('width') is not None else -1,
                t.get('height') if t.get('height') is not None else -1,
                t.get('id') if t.get('id') is not None else '', t.get('url')))
            for i, t in enumerate(thumbnails):
                t['url'] = sanitize_url(t['url'])
                if t.get('width') and t.get('height'):
                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                if t.get('id') is None:
                    t['id'] = '%d' % i

        if self.params.get('list_thumbnails'):
            self.list_thumbnails(info_dict)
            return

        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnail'] = sanitize_url(thumbnail)
        elif thumbnails:
            # Last entry is the best one after the sort above.
            info_dict['thumbnail'] = thumbnails[-1]['url']

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive YYYYMMDD date fields from their epoch-timestamp counterparts.
        for ts_key, date_key in (
                ('timestamp', 'upload_date'),
                ('release_timestamp', 'release_date'),
        ):
            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
                # see http://bugs.python.org/issue1646728)
                try:
                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
                    info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
                except (ValueError, OverflowError, OSError):
                    pass

        # Auto generate title fields corresponding to the *_number fields when missing
        # in order to always have clean titles. This is very common for TV series.
        for field in ('chapter', 'season', 'episode'):
            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

        # Sanitize subtitle/caption URLs and infer missing extensions.
        for cc_kind in ('subtitles', 'automatic_captions'):
            cc = info_dict.get(cc_kind)
            if cc:
                for _, subtitle in cc.items():
                    for subtitle_format in subtitle:
                        if subtitle_format.get('url'):
                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                        if subtitle_format.get('ext') is None:
                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

        automatic_captions = info_dict.get('automatic_captions')
        subtitles = info_dict.get('subtitles')

        if self.params.get('listsubtitles', False):
            if 'automatic_captions' in info_dict:
                self.list_subtitles(
                    info_dict['id'], automatic_captions, 'automatic captions')
            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
            return

        info_dict['requested_subtitles'] = self.process_subtitles(
            info_dict['id'], subtitles, automatic_captions)

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        def is_wellformed(f):
            url = f.get('url')
            if not url:
                self.report_warning(
                    '"url" field is missing or empty - skipping format, '
                    'there is an error in extractor')
                return False
            if isinstance(url, bytes):
                sanitize_string_field(f, 'url')
            return True

        # Filter out malformed formats for better extraction robustness
        formats = list(filter(is_wellformed, formats or []))

        if not formats:
            raise ExtractorError('No video formats found!')

        formats_dict = {}

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            sanitize_string_field(format, 'format_id')
            sanitize_numeric_fields(format)
            format['url'] = sanitize_url(format['url'])
            if not format.get('format_id'):
                format['format_id'] = compat_str(i)
            else:
                # Sanitize format_id from characters used in format selector expression
                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
            formats_dict[format_id].append(format)

        # Make sure all formats have unique format_id
        for format_id, ambiguous_formats in formats_dict.items():
            if len(ambiguous_formats) > 1:
                for i, format in enumerate(ambiguous_formats):
                    format['format_id'] = '%s-%d' % (format_id, i)

        for i, format in enumerate(formats):
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if format.get('ext') is None:
                format['ext'] = determine_ext(format['url']).lower()
            # Automatically determine protocol if missing (useful for format
            # selection purposes)
            if format.get('protocol') is None:
                format['protocol'] = determine_protocol(format)
            # Add HTTP headers, so that external programs can use them from the
            # json output
            full_format_info = info_dict.copy()
            full_format_info.update(format)
            format['http_headers'] = self._calc_headers(full_format_info)
        # Remove private housekeeping stuff
        if '__x_forwarded_for_ip' in info_dict:
            del info_dict['__x_forwarded_for_ip']

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats'):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = self._default_format_spec(info_dict, download=download)
            if self.params.get('verbose'):
                self._write_string('[debug] Default format spec: %s\n' % req_format)

        format_selector = self.build_format_selector(req_format)

        # While in format selection we may need to have an access to the original
        # format set in order to calculate some metrics or do some processing.
        # For now we need to be able to guess whether original formats provided
        # by extractor are incomplete or not (i.e. whether extractor provides only
        # video-only or audio-only formats) for proper formats selection for
        # extractors with such incomplete formats (see
        # https://github.com/ytdl-org/youtube-dl/pull/5556).
        # Since formats may be filtered during format selection and may not match
        # the original formats the results may be incorrect. Thus original formats
        # or pre-calculated metrics should be passed to format selection routines
        # as well.
        # We will pass a context object containing all necessary additional data
        # instead of just formats.
        # This fixes incorrect format selection issue (see
        # https://github.com/ytdl-org/youtube-dl/issues/10083).
        incomplete_formats = (
            # All formats are video-only or
            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
            # all formats are audio-only
            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

        ctx = {
            'formats': formats,
            'incomplete_formats': incomplete_formats,
        }

        formats_to_download = list(format_selector(ctx))
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
1687
1688 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1689 """Select the requested subtitles and their format"""
1690 available_subs = {}
1691 if normal_subtitles and self.params.get('writesubtitles'):
1692 available_subs.update(normal_subtitles)
1693 if automatic_captions and self.params.get('writeautomaticsub'):
1694 for lang, cap_info in automatic_captions.items():
1695 if lang not in available_subs:
1696 available_subs[lang] = cap_info
1697
1698 if (not self.params.get('writesubtitles') and not
1699 self.params.get('writeautomaticsub') or not
1700 available_subs):
1701 return None
1702
1703 if self.params.get('allsubtitles', False):
1704 requested_langs = available_subs.keys()
1705 else:
1706 if self.params.get('subtitleslangs', False):
1707 requested_langs = self.params.get('subtitleslangs')
1708 elif 'en' in available_subs:
1709 requested_langs = ['en']
1710 else:
1711 requested_langs = [list(available_subs.keys())[0]]
1712
1713 formats_query = self.params.get('subtitlesformat', 'best')
1714 formats_preference = formats_query.split('/') if formats_query else []
1715 subs = {}
1716 for lang in requested_langs:
1717 formats = available_subs.get(lang)
1718 if formats is None:
1719 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1720 continue
1721 for ext in formats_preference:
1722 if ext == 'best':
1723 f = formats[-1]
1724 break
1725 matches = list(filter(lambda f: f['ext'] == ext, formats))
1726 if matches:
1727 f = matches[-1]
1728 break
1729 else:
1730 f = formats[-1]
1731 self.report_warning(
1732 'No subtitle format found matching "%s" for language %s, '
1733 'using %s' % (formats_query, lang, f['ext']))
1734 subs[lang] = f
1735 return subs
1736
1737 def __forced_printings(self, info_dict, filename, incomplete):
1738 def print_mandatory(field):
1739 if (self.params.get('force%s' % field, False)
1740 and (not incomplete or info_dict.get(field) is not None)):
1741 self.to_stdout(info_dict[field])
1742
1743 def print_optional(field):
1744 if (self.params.get('force%s' % field, False)
1745 and info_dict.get(field) is not None):
1746 self.to_stdout(info_dict[field])
1747
1748 print_mandatory('title')
1749 print_mandatory('id')
1750 if self.params.get('forceurl', False) and not incomplete:
1751 if info_dict.get('requested_formats') is not None:
1752 for f in info_dict['requested_formats']:
1753 self.to_stdout(f['url'] + f.get('play_path', ''))
1754 else:
1755 # For RTMP URLs, also include the playpath
1756 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1757 print_optional('thumbnail')
1758 print_optional('description')
1759 if self.params.get('forcefilename', False) and filename is not None:
1760 self.to_stdout(filename)
1761 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1762 self.to_stdout(formatSeconds(info_dict['duration']))
1763 print_mandatory('format')
1764 if self.params.get('forcejson', False):
1765 self.to_stdout(json.dumps(info_dict))
1766
    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Writes the requested side files (description, annotations,
        subtitles, info JSON, thumbnails), performs the actual media
        download - fetching and merging several formats if requested -
        and schedules fixup postprocessors, honouring the simulate and
        skip_download options.
        """

        assert info_dict.get('_type', 'video') == 'video'

        # Stop as soon as the --max-downloads quota is used up.
        max_downloads = int_or_none(self.params.get('max_downloads')) or float('inf')
        if self._num_downloads >= max_downloads:
            raise MaxDownloadsReached()

        # TODO: backward compatibility, to be removed
        info_dict['fulltitle'] = info_dict['title']

        if 'format' not in info_dict:
            info_dict['format'] = info_dict['ext']

        # Apply the user's filters; a non-None reason means "skip this video".
        reason = self._match_entry(info_dict, incomplete=False)
        if reason is not None:
            self.to_screen('[download] ' + reason)
            return

        self._num_downloads += 1

        info_dict['_filename'] = filename = self.prepare_filename(info_dict)

        # Forced printings
        self.__forced_printings(info_dict, filename, incomplete=False)

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        def ensure_dir_exists(path):
            # Create the parent directory of *path* if needed; EEXIST is
            # tolerated since a concurrent process may have created it.
            try:
                dn = os.path.dirname(path)
                if dn and not os.path.exists(dn):
                    os.makedirs(dn)
                return True
            except (OSError, IOError) as err:
                if isinstance(err, OSError) and err.errno == errno.EEXIST:
                    return True
                self.report_error('unable to create directory ' + error_to_compat_str(err))
                return False

        if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
            return

        # Optional .description side file.
        if self.params.get('writedescription', False):
            descfn = replace_extension(filename, 'description', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
                self.to_screen('[info] Video description is already present')
            elif info_dict.get('description') is None:
                self.report_warning('There\'s no description to write.')
            else:
                try:
                    self.to_screen('[info] Writing video description to: ' + descfn)
                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                        descfile.write(info_dict['description'])
                except (OSError, IOError):
                    self.report_error('Cannot write description file ' + descfn)
                    return

        # Optional .annotations.xml side file.
        if self.params.get('writeannotations', False):
            annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
                self.to_screen('[info] Video annotations are already present')
            elif not info_dict.get('annotations'):
                self.report_warning('There are no annotations to write.')
            else:
                try:
                    self.to_screen('[info] Writing video annotations to: ' + annofn)
                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                        annofile.write(info_dict['annotations'])
                except (KeyError, TypeError):
                    self.report_warning('There are no annotations to write.')
                except (OSError, IOError):
                    self.report_error('Cannot write annotations file: ' + annofn)
                    return

        subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                       self.params.get('writeautomaticsub')])

        if subtitles_are_requested and info_dict.get('requested_subtitles'):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['requested_subtitles']
            ie = self.get_info_extractor(info_dict['extractor_key'])
            for sub_lang, sub_info in subtitles.items():
                sub_format = sub_info['ext']
                sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                else:
                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                    if sub_info.get('data') is not None:
                        # Subtitle content was already extracted in-memory.
                        try:
                            # Use newline='' to prevent conversion of newline characters
                            # See https://github.com/ytdl-org/youtube-dl/issues/10268
                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
                                subfile.write(sub_info['data'])
                        except (OSError, IOError):
                            self.report_error('Cannot write subtitles file ' + sub_filename)
                            return
                    else:
                        # Subtitle must be fetched from its URL first.
                        try:
                            sub_data = ie._request_webpage(
                                sub_info['url'], info_dict['id'], note=False).read()
                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
                                subfile.write(sub_data)
                        except (ExtractorError, IOError, OSError, ValueError) as err:
                            self.report_warning('Unable to download subtitle for "%s": %s' %
                                                (sub_lang, error_to_compat_str(err)))
                            continue

        # Optional .info.json dump of the full (filtered) metadata.
        if self.params.get('writeinfojson', False):
            infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
                self.to_screen('[info] Video description metadata is already present')
            else:
                self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
                try:
                    write_json_file(self.filter_requested_info(info_dict), infofn)
                except (OSError, IOError):
                    self.report_error('Cannot write metadata to JSON file ' + infofn)
                    return

        self._write_thumbnails(info_dict, filename)

        if not self.params.get('skip_download', False):
            try:
                def checked_get_suitable_downloader(info_dict, params):
                    # Detect the case where get_suitable_downloader rejected
                    # the requested external downloader: it then clears
                    # external_downloader_args from params.
                    ed_args = params.get('external_downloader_args')
                    dler = get_suitable_downloader(info_dict, params)
                    if ed_args and not params.get('external_downloader_args'):
                        # external_downloader_args was cleared because external_downloader was rejected
                        self.report_warning('Requested external downloader cannot be used: '
                                            'ignoring --external-downloader-args.')
                    return dler

                def dl(name, info):
                    # Instantiate the right downloader, wire up the
                    # progress hooks, then run the actual download.
                    fd = checked_get_suitable_downloader(info, self.params)(self, self.params)
                    for ph in self._progress_hooks:
                        fd.add_progress_hook(ph)
                    if self.params.get('verbose'):
                        self.to_screen('[debug] Invoking downloader on %r' % info.get('url'))
                    return fd.download(name, info)

                if info_dict.get('requested_formats') is not None:
                    # Several formats requested: download each one and
                    # schedule ffmpeg to merge them afterwards.
                    downloaded = []
                    success = True
                    merger = FFmpegMergerPP(self)
                    if not merger.available:
                        postprocessors = []
                        self.report_warning('You have requested multiple '
                                            'formats but ffmpeg or avconv are not installed.'
                                            ' The formats won\'t be merged.')
                    else:
                        postprocessors = [merger]

                    def compatible_formats(formats):
                        # Decide whether the video+audio pair can share a
                        # single container without remuxing into mkv.
                        video, audio = formats
                        # Check extension
                        video_ext, audio_ext = video.get('ext'), audio.get('ext')
                        if video_ext and audio_ext:
                            COMPATIBLE_EXTS = (
                                # NOTE(review): ('webm') is a plain string, not
                                # a one-element tuple, so the membership test
                                # below is a substring check for that entry —
                                # verify whether ('webm',) was intended.
                                ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
                                ('webm')
                            )
                            for exts in COMPATIBLE_EXTS:
                                if video_ext in exts and audio_ext in exts:
                                    return True
                        # TODO: Check acodec/vcodec
                        return False

                    filename_real_ext = os.path.splitext(filename)[1][1:]
                    filename_wo_ext = (
                        os.path.splitext(filename)[0]
                        if filename_real_ext == info_dict['ext']
                        else filename)
                    requested_formats = info_dict['requested_formats']
                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                        info_dict['ext'] = 'mkv'
                        self.report_warning(
                            'Requested formats are incompatible for merge and will be merged into mkv.')
                    # Ensure filename always has a correct extension for successful merge
                    filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
                    if os.path.exists(encodeFilename(filename)):
                        self.to_screen(
                            '[download] %s has already been downloaded and '
                            'merged' % filename)
                    else:
                        # Each part goes into an 'f<format_id>'-prefixed
                        # temporary file; the merger consumes them later.
                        for f in requested_formats:
                            new_info = dict(info_dict)
                            new_info.update(f)
                            fname = prepend_extension(
                                self.prepare_filename(new_info),
                                'f%s' % f['format_id'], new_info['ext'])
                            if not ensure_dir_exists(fname):
                                return
                            downloaded.append(fname)
                            partial_success = dl(fname, new_info)
                            success = success and partial_success
                        info_dict['__postprocessors'] = postprocessors
                        info_dict['__files_to_merge'] = downloaded
                else:
                    # Just a single file
                    success = dl(filename, info_dict)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            if success and filename != '-':
                # Fixup content
                fixup_policy = self.params.get('fixup')
                if fixup_policy is None:
                    fixup_policy = 'detect_or_warn'

                INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'

                # Fix files with a non-square pixel aspect ratio.
                stretched_ratio = info_dict.get('stretched_ratio')
                if stretched_ratio is not None and stretched_ratio != 1:
                    if fixup_policy == 'warn':
                        self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
                            info_dict['id'], stretched_ratio))
                    elif fixup_policy == 'detect_or_warn':
                        stretched_pp = FFmpegFixupStretchedPP(self)
                        if stretched_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(stretched_pp)
                        else:
                            self.report_warning(
                                '%s: Non-uniform pixel ratio (%s). %s'
                                % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                # Fix single-format DASH m4a files.
                if (info_dict.get('requested_formats') is None
                        and info_dict.get('container') == 'm4a_dash'):
                    if fixup_policy == 'warn':
                        self.report_warning(
                            '%s: writing DASH m4a. '
                            'Only some players support this container.'
                            % info_dict['id'])
                    elif fixup_policy == 'detect_or_warn':
                        fixup_pp = FFmpegFixupM4aPP(self)
                        if fixup_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(fixup_pp)
                        else:
                            self.report_warning(
                                '%s: writing DASH m4a. '
                                'Only some players support this container. %s'
                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                # Fix AAC bitstreams from HLS downloads.  Note the operator
                # precedence: this matches protocol 'm3u8_native', or
                # 'm3u8' only when the native HLS downloader is preferred.
                if (info_dict.get('protocol') == 'm3u8_native'
                        or info_dict.get('protocol') == 'm3u8'
                        and self.params.get('hls_prefer_native')):
                    if fixup_policy == 'warn':
                        self.report_warning('%s: malformed AAC bitstream detected.' % (
                            info_dict['id']))
                    elif fixup_policy == 'detect_or_warn':
                        fixup_pp = FFmpegFixupM3u8PP(self)
                        if fixup_pp.available:
                            info_dict.setdefault('__postprocessors', [])
                            info_dict['__postprocessors'].append(fixup_pp)
                        else:
                            self.report_warning(
                                '%s: malformed AAC bitstream detected. %s'
                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                    else:
                        assert fixup_policy in ('ignore', 'never')

                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError) as err:
                    self.report_error('postprocessing: %s' % error_to_compat_str(err))
                    return
        self.record_download_archive(info_dict)
        # avoid possible nugatory search for further items (PR #26638)
        if self._num_downloads >= max_downloads:
            raise MaxDownloadsReached()
2057
2058 def download(self, url_list):
2059 """Download a given list of URLs."""
2060 outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
2061 if (len(url_list) > 1
2062 and outtmpl != '-'
2063 and '%' not in outtmpl
2064 and self.params.get('max_downloads') != 1):
2065 raise SameFileError(outtmpl)
2066
2067 for url in url_list:
2068 try:
2069 # It also downloads the videos
2070 res = self.extract_info(
2071 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
2072 except UnavailableVideoError:
2073 self.report_error('unable to download video')
2074 except MaxDownloadsReached:
2075 self.to_screen('[info] Maximum number of downloaded files reached.')
2076 raise
2077 else:
2078 if self.params.get('dump_single_json', False):
2079 self.to_stdout(json.dumps(res))
2080
2081 return self._download_retcode
2082
    def download_with_info_file(self, info_filename):
        """Download using a JSON info dict previously saved to a file
        (e.g. via --write-info-json), retrying from the recorded
        webpage_url if the stored info turns out to be stale.
        """
        with contextlib.closing(fileinput.FileInput(
                [info_filename], mode='r',
                openhook=fileinput.hook_encoded('utf-8'))) as f:
            # FileInput doesn't have a read method, we can't call json.load
            info = self.filter_requested_info(json.loads('\n'.join(f)))
        try:
            self.process_ie_result(info, download=True)
        except DownloadError:
            webpage_url = info.get('webpage_url')
            if webpage_url is not None:
                # The saved info may be outdated; re-extract from scratch.
                self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
                return self.download([webpage_url])
            else:
                raise
        return self._download_retcode
2099
2100 @staticmethod
2101 def filter_requested_info(info_dict):
2102 return dict(
2103 (k, v) for k, v in info_dict.items()
2104 if k not in ['requested_formats', 'requested_subtitles'])
2105
2106 def post_process(self, filename, ie_info):
2107 """Run all the postprocessors on the given file."""
2108 info = dict(ie_info)
2109 info['filepath'] = filename
2110 pps_chain = []
2111 if ie_info.get('__postprocessors') is not None:
2112 pps_chain.extend(ie_info['__postprocessors'])
2113 pps_chain.extend(self._pps)
2114 for pp in pps_chain:
2115 files_to_delete = []
2116 try:
2117 files_to_delete, info = pp.run(info)
2118 except PostProcessingError as e:
2119 self.report_error(e.msg)
2120 if files_to_delete and not self.params.get('keepvideo', False):
2121 for old_filename in files_to_delete:
2122 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2123 try:
2124 os.remove(encodeFilename(old_filename))
2125 except (IOError, OSError):
2126 self.report_warning('Unable to remove downloaded original file')
2127
2128 def _make_archive_id(self, info_dict):
2129 video_id = info_dict.get('id')
2130 if not video_id:
2131 return
2132 # Future-proof against any change in case
2133 # and backwards compatibility with prior versions
2134 extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
2135 if extractor is None:
2136 url = str_or_none(info_dict.get('url'))
2137 if not url:
2138 return
2139 # Try to find matching extractor for the URL and take its ie_key
2140 for ie in self._ies:
2141 if ie.suitable(url):
2142 extractor = ie.ie_key()
2143 break
2144 else:
2145 return
2146 return extractor.lower() + ' ' + video_id
2147
2148 def in_download_archive(self, info_dict):
2149 fn = self.params.get('download_archive')
2150 if fn is None:
2151 return False
2152
2153 vid_id = self._make_archive_id(info_dict)
2154 if not vid_id:
2155 return False # Incomplete video information
2156
2157 try:
2158 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2159 for line in archive_file:
2160 if line.strip() == vid_id:
2161 return True
2162 except IOError as ioe:
2163 if ioe.errno != errno.ENOENT:
2164 raise
2165 return False
2166
2167 def record_download_archive(self, info_dict):
2168 fn = self.params.get('download_archive')
2169 if fn is None:
2170 return
2171 vid_id = self._make_archive_id(info_dict)
2172 assert vid_id
2173 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2174 archive_file.write(vid_id + '\n')
2175
2176 @staticmethod
2177 def format_resolution(format, default='unknown'):
2178 if format.get('vcodec') == 'none':
2179 return 'audio only'
2180 if format.get('resolution') is not None:
2181 return format['resolution']
2182 if format.get('height') is not None:
2183 if format.get('width') is not None:
2184 res = '%sx%s' % (format['width'], format['height'])
2185 else:
2186 res = '%sp' % format['height']
2187 elif format.get('width') is not None:
2188 res = '%dx?' % format['width']
2189 else:
2190 res = default
2191 return res
2192
    def _format_note(self, fdict):
        """Build the free-form 'note' column for the formats table:
        language, bitrates, codecs, fps, sample rate and filesize."""
        res = ''
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None
                and fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            if fdict.get('vbr') is not None:
                # '@' glues the codec name to the video bitrate appended below.
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # Both bitrates known but no video codec name available.
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                # An explicit 'none' acodec marks a video-only format.
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
2248
2249 def list_formats(self, info_dict):
2250 formats = info_dict.get('formats', [info_dict])
2251 table = [
2252 [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2253 for f in formats
2254 if f.get('preference') is None or f['preference'] >= -1000]
2255 if len(formats) > 1:
2256 table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2257
2258 header_line = ['format code', 'extension', 'resolution', 'note']
2259 self.to_screen(
2260 '[info] Available formats for %s:\n%s' %
2261 (info_dict['id'], render_table(header_line, table)))
2262
2263 def list_thumbnails(self, info_dict):
2264 thumbnails = info_dict.get('thumbnails')
2265 if not thumbnails:
2266 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2267 return
2268
2269 self.to_screen(
2270 '[info] Thumbnails for %s:' % info_dict['id'])
2271 self.to_screen(render_table(
2272 ['ID', 'width', 'height', 'URL'],
2273 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2274
2275 def list_subtitles(self, video_id, subtitles, name='subtitles'):
2276 if not subtitles:
2277 self.to_screen('%s has no %s' % (video_id, name))
2278 return
2279 self.to_screen(
2280 'Available %s for %s:' % (name, video_id))
2281 self.to_screen(render_table(
2282 ['Language', 'formats'],
2283 [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2284 for lang, formats in subtitles.items()]))
2285
    def urlopen(self, req):
        """ Start an HTTP download """
        # Accept both plain URL strings and Request objects.
        if isinstance(req, compat_basestring):
            req = sanitized_Request(req)
        return self._opener.open(req, timeout=self._socket_timeout)
2291
    def print_debug_header(self):
        """Write verbose diagnostic information (versions, encodings,
        proxies) to the debug output; no-op unless verbose mode is on."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best effort: also report the git commit when running from a checkout.
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = process_communicate_or_kill(sp)
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # sys.exc_clear() only exists on Python 2.
                sys.exc_clear()
            except Exception:
                pass

        def python_implementation():
            # Include the PyPy version triple when running under PyPy.
            impl_name = platform.python_implementation()
            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
            return impl_name

        self._write_string('[debug] Python version %s (%s) - %s\n' % (
            platform.python_version(), python_implementation(),
            platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the effective proxy settings from all opener handlers.
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Network round-trips happen only when --call-home is given.
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2367
    def _setup_opener(self):
        """Build the urllib opener (cookies, proxies, custom handlers)
        used for all HTTP(S) requests and store it as self._opener."""
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout is 600 seconds when not configured.
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # No cookie file: keep cookies in memory only.
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = expand_path(opts_cookiefile)
            self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load(ignore_discard=True, ignore_expires=True)

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An empty --proxy value disables proxying entirely.
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        redirect_handler = YoutubeDLRedirectHandler()
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/ytdl-org/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2420
2421 def encode(self, s):
2422 if isinstance(s, bytes):
2423 return s # Already encoded
2424
2425 try:
2426 return s.encode(self.get_encoding())
2427 except UnicodeEncodeError as err:
2428 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2429 raise
2430
2431 def get_encoding(self):
2432 encoding = self.params.get('encoding')
2433 if encoding is None:
2434 encoding = preferredencoding()
2435 return encoding
2436
    def _write_thumbnails(self, info_dict, filename):
        """Download thumbnail(s) next to the video file.

        With 'writethumbnail' only the last listed thumbnail is fetched;
        with 'write_all_thumbnails' every known thumbnail is fetched.
        """
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                # Only the last entry is kept — presumably the best one;
                # NOTE(review): assumes thumbnails are sorted worst-to-best.
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Only disambiguate file names / log output when more than one
            # thumbnail is being written.
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext'))

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    # A failed thumbnail download is not fatal for the video.
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))
|