1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import ssl
14 import sys
15 import time
16 import math
17
18 from ..compat import (
19 compat_cookiejar_Cookie,
20 compat_cookies_SimpleCookie,
21 compat_etree_Element,
22 compat_etree_fromstring,
23 compat_getpass,
24 compat_integer_types,
25 compat_http_client,
26 compat_os_name,
27 compat_str,
28 compat_urllib_error,
29 compat_urllib_parse_unquote,
30 compat_urllib_parse_urlencode,
31 compat_urllib_request,
32 compat_urlparse,
33 compat_xml_parse_error,
34 )
35 from ..downloader.f4m import (
36 get_base_url,
37 remove_encrypted_media,
38 )
39 from ..utils import (
40 NO_DEFAULT,
41 age_restricted,
42 base_url,
43 bug_reports_message,
44 clean_html,
45 compiled_regex_type,
46 determine_ext,
47 determine_protocol,
48 dict_get,
49 error_to_compat_str,
50 ExtractorError,
51 extract_attributes,
52 fix_xml_ampersands,
53 float_or_none,
54 GeoRestrictedError,
55 GeoUtils,
56 int_or_none,
57 js_to_json,
58 JSON_LD_RE,
59 mimetype2ext,
60 orderedSet,
61 parse_bitrate,
62 parse_codecs,
63 parse_duration,
64 parse_iso8601,
65 parse_m3u8_attributes,
66 parse_resolution,
67 RegexNotFoundError,
68 sanitized_Request,
69 sanitize_filename,
70 str_or_none,
71 str_to_int,
72 strip_or_none,
73 try_get,
74 unescapeHTML,
75 unified_strdate,
76 unified_timestamp,
77 update_Request,
78 update_url_query,
79 urljoin,
80 url_basename,
81 url_or_none,
82 xpath_element,
83 xpath_text,
84 xpath_with_ns,
85 )
86
87
88 class InfoExtractor(object):
89 """Information Extractor class.
90
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
95 passed to the YoutubeDL. The YoutubeDL processes this
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
98
99 The type field determines the type of the result.
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
102
103 For a video, the dictionaries must include the following fields:
104
105 id: Video identifier.
106 title: Video title, unescaped.
107
108 Additionally, it must contain either a formats entry or a url one:
109
110 formats: A list of dictionaries for each format available, ordered
111 from worst to best quality.
112
113 Potential fields:
114 * url The mandatory URL representing the media:
115 for plain file media - HTTP URL of this file,
116 for RTMP - RTMP URL,
117 for HLS - URL of the M3U8 media playlist,
118 for HDS - URL of the F4M manifest,
119 for DASH
120 - HTTP URL to plain file media (in case of
121 unfragmented media)
122 - URL of the MPD manifest or base URL
123 representing the media if MPD manifest
124 is parsed from a string (in case of
125 fragmented media)
126 for MSS - URL of the ISM manifest.
127 * manifest_url
128 The URL of the manifest file in case of
129 fragmented media:
130 for HLS - URL of the M3U8 master playlist,
131 for HDS - URL of the F4M manifest,
132 for DASH - URL of the MPD manifest,
133 for MSS - URL of the ISM manifest.
134 * ext Will be calculated from URL if missing
135 * format A human-readable description of the format
136 ("mp4 container with h264/opus").
137 Calculated from the format_id, width, height.
138 and format_note fields if missing.
139 * format_id A short description of the format
140 ("mp4_h264_opus" or "19").
141 Technically optional, but strongly recommended.
142 * format_note Additional info about the format
143 ("3D" or "DASH video")
144 * width Width of the video, if known
145 * height Height of the video, if known
146 * resolution Textual description of width and height
147 * tbr Average bitrate of audio and video in KBit/s
148 * abr Average audio bitrate in KBit/s
149 * acodec Name of the audio codec in use
150 * asr Audio sampling rate in Hertz
151 * vbr Average video bitrate in KBit/s
152 * fps Frame rate
153 * vcodec Name of the video codec in use
154 * container Name of the container format
155 * filesize The number of bytes, if known in advance
156 * filesize_approx An estimate for the number of bytes
157 * player_url SWF Player URL (used for rtmpdump).
158 * protocol The protocol that will be used for the actual
159 download, lower-case.
160 "http", "https", "rtsp", "rtmp", "rtmpe",
161 "m3u8", "m3u8_native" or "http_dash_segments".
162 * fragment_base_url
163 Base URL for fragments. Each fragment's path
164 value (if present) will be relative to
165 this URL.
166 * fragments A list of fragments of a fragmented media.
167 Each fragment entry must contain either an url
168 or a path. If an url is present it should be
169 considered by a client. Otherwise both path and
170 fragment_base_url must be present. Here is
171 the list of all potential fields:
172 * "url" - fragment's URL
173 * "path" - fragment's path relative to
174 fragment_base_url
175 * "duration" (optional, int or float)
176 * "filesize" (optional, int)
177 * preference Order number of this format. If this field is
178 present and not None, the formats get sorted
179 by this field, regardless of all other values.
180 -1 for default (order by other properties),
181 -2 or smaller for less than default.
182 < -1000 to hide the format (if there is
183 another one which is strictly better)
184 * language Language code, e.g. "de" or "en-US".
185 * language_preference Is this in the language mentioned in
186 the URL?
187 10 if it's what the URL is about,
188 -1 for default (don't know),
189 -10 otherwise, other values reserved for now.
190 * quality Order number of the video quality of this
191 format, irrespective of the file format.
192 -1 for default (order by other properties),
193 -2 or smaller for less than default.
194 * source_preference Order number for this video source
195 (quality takes higher priority)
196 -1 for default (order by other properties),
197 -2 or smaller for less than default.
198 * http_headers A dictionary of additional HTTP headers
199 to add to the request.
200 * stretched_ratio If given and not 1, indicates that the
201 video's pixels are not square.
202 width : height ratio as float.
203 * no_resume The server does not support resuming the
204 (HTTP or RTMP) download. Boolean.
205 * downloader_options A dictionary of downloader options as
206 described in FileDownloader
207
208 url: Final video URL.
209 ext: Video filename extension.
210 format: The video format, defaults to ext (used for --get-format)
211 player_url: SWF Player URL (used for rtmpdump).
212
213 The following fields are optional:
214
215 alt_title: A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
217 unique, but available before title. Typically, id is
218 something like "4234987", title "Dancing naked mole rats",
219 and display_id "dancing-naked-mole-rats"
220 thumbnails: A list of dictionaries, with the following entries:
221 * "id" (optional, string) - Thumbnail format ID
222 * "url"
223 * "preference" (optional, int) - quality of the image
224 * "width" (optional, int)
225 * "height" (optional, int)
226 * "resolution" (optional, string "{width}x{height}",
227 deprecated)
228 * "filesize" (optional, int)
229 thumbnail: Full URL to a video thumbnail image.
230 description: Full video description.
231 uploader: Full name of the video uploader.
232 license: License name the video is licensed under.
233 creator: The creator of the video.
234 release_timestamp: UNIX timestamp of the moment the video was released.
235 release_date: The date (YYYYMMDD) when the video was released.
236 timestamp: UNIX timestamp of the moment the video became available
237 (uploaded).
238 upload_date: Video upload date (YYYYMMDD).
239 If not explicitly set, calculated from timestamp.
240 uploader_id: Nickname or id of the video uploader.
241 uploader_url: Full URL to a personal webpage of the video uploader.
242 channel: Full name of the channel the video is uploaded on.
243 Note that channel fields may or may not repeat uploader
244 fields. This depends on a particular extractor.
245 channel_id: Id of the channel.
246 channel_url: Full URL to a channel webpage.
247 location: Physical location where the video was filmed.
248 subtitles: The available subtitles as a dictionary in the format
249 {tag: subformats}. "tag" is usually a language code, and
250 "subformats" is a list sorted from lower to higher
251 preference, each element is a dictionary with the "ext"
252 entry and one of:
253 * "data": The subtitles file contents
254 * "url": A URL pointing to the subtitles file
255 "ext" will be calculated from URL if missing
256 automatic_captions: Like 'subtitles', used by the YoutubeIE for
257 automatically generated captions
258 duration: Length of the video in seconds, as an integer or float.
259 view_count: How many users have watched the video on the platform.
260 like_count: Number of positive ratings of the video
261 dislike_count: Number of negative ratings of the video
262 repost_count: Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
264 comment_count: Number of comments on the video
265 comments: A list of comments, each with one or more of the following
266 properties (all but one of text or html optional):
267 * "author" - human-readable name of the comment author
268 * "author_id" - user ID of the comment author
269 * "id" - Comment ID
270 * "html" - Comment as HTML
271 * "text" - Plain text of the comment
272 * "timestamp" - UNIX timestamp of comment
273 * "parent" - ID of the comment this one is replying to.
274 Set to "root" to indicate that this is a
275 comment to the original video.
276 age_limit: Age restriction for the video, as an integer (years)
277 webpage_url: The URL to the video webpage, if given to youtube-dl it
278 should allow to get the same result again. (It will be set
279 by YoutubeDL if it's missing)
280 categories: A list of categories that the video falls in, for example
281 ["Sports", "Berlin"]
282 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
283 is_live: True, False, or None (=unknown). Whether this video is a
284 live stream that goes on instead of a fixed-length video.
285 start_time: Time in seconds where the reproduction should start, as
286 specified in the URL.
287 end_time: Time in seconds where the reproduction should end, as
288 specified in the URL.
289 chapters: A list of dictionaries, with the following entries:
290 * "start_time" - The start time of the chapter in seconds
291 * "end_time" - The end time of the chapter in seconds
292 * "title" (optional, string)
293
294 The following fields should only be used when the video belongs to some logical
295 chapter or section:
296
297 chapter: Name or title of the chapter the video belongs to.
298 chapter_number: Number of the chapter the video belongs to, as an integer.
299 chapter_id: Id of the chapter the video belongs to, as a unicode string.
300
301 The following fields should only be used when the video is an episode of some
302 series, programme or podcast:
303
304 series: Title of the series or programme the video episode belongs to.
305 season: Title of the season the video episode belongs to.
306 season_number: Number of the season the video episode belongs to, as an integer.
307 season_id: Id of the season the video episode belongs to, as a unicode string.
308 episode: Title of the video episode. Unlike mandatory video title field,
309 this field should denote the exact title of the video episode
310 without any kind of decoration.
311 episode_number: Number of the video episode within a season, as an integer.
312 episode_id: Id of the video episode, as a unicode string.
313
314 The following fields should only be used when the media is a track or a part of
315 a music album:
316
317 track: Title of the track.
318 track_number: Number of the track within an album or a disc, as an integer.
319 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
320 as a unicode string.
321 artist: Artist(s) of the track.
322 genre: Genre(s) of the track.
323 album: Title of the album the track belongs to.
324 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
325 album_artist: List of all artists appeared on the album (e.g.
326 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
327 and compilations).
328 disc_number: Number of the disc or other physical medium the track belongs to,
329 as an integer.
330 release_year: Year (YYYY) when the album was released.
331
332 Unless mentioned otherwise, the fields should be Unicode strings.
333
334 Unless mentioned otherwise, None is equivalent to absence of information.
335
336
337 _type "playlist" indicates multiple videos.
338 There must be a key "entries", which is a list, an iterable, or a PagedList
339 object, each element of which is a valid dictionary by this specification.
340
341 Additionally, playlists can have "id", "title", "description", "uploader",
342 "uploader_id", "uploader_url", "duration" attributes with the same semantics
343 as videos (see above).
344
345
346 _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
348 It must have an entries key like a playlist and contain all the keys
349 required for a video at the same time.
350
351
352 _type "url" indicates that the video must be extracted from another
353 location, possibly by a different extractor. Its only required key is:
354 "url" - the next URL to extract.
355 The key "ie_key" can be set to the class name (minus the trailing "IE",
356 e.g. "Youtube") if the extractor class is known in advance.
357 Additionally, the dictionary may have any properties of the resolved entity
358 known in advance, for example "title" if the title of the referred video is
359 known ahead of time.
360
361
362 _type "url_transparent" entities have the same specification as "url", but
363 indicate that the given additional information is more precise than the one
364 associated with the resolved URL.
365 This is useful when a site employs a video service that hosts the video and
366 its technical metadata, but that video service does not embed a useful
367 title, description etc.
368
369
370 Subclasses of this one should re-define the _real_initialize() and
371 _real_extract() methods and define a _VALID_URL regexp.
372 Probably, they should also be added to the list of extractors.
373
374 _GEO_BYPASS attribute may be set to False in order to disable
375 geo restriction bypass mechanisms for a particular extractor.
376 Though it won't disable explicit geo restriction bypass based on
377 country code provided with geo_bypass_country.
378
379 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
380 countries for this extractor. One of these countries will be used by
381 geo restriction bypass mechanism right away in order to bypass
382 geo restriction, of course, if the mechanism is not disabled.
383
384 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
385 IP blocks in CIDR notation for this extractor. One of these IP blocks
386 will be used by geo restriction bypass mechanism similarly
387 to _GEO_COUNTRIES.
388
389 Finally, the _WORKING attribute should be set to False for broken IEs
390 in order to warn the users and skip the tests.
391 """
392
    # Whether _real_initialize() has already run for this instance
    # (set by initialize())
    _ready = False
    # The YoutubeDL instance driving this extractor (set via set_downloader())
    _downloader = None
    # Fake source IP used for geo restriction bypass; sent as the
    # X-Forwarded-For HTTP header once set (see _initialize_geo_bypass)
    _x_forwarded_for_ip = None
    # May be set to False by subclasses to disable geo bypass mechanisms
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted country codes
    _GEO_COUNTRIES = None
    # Optional list of presumably geo unrestricted IP blocks (CIDR notation)
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors to warn users and skip tests
    _WORKING = True
400
401 def __init__(self, downloader=None):
402 """Constructor. Receives an optional downloader."""
403 self._ready = False
404 self._x_forwarded_for_ip = None
405 self.set_downloader(downloader)
406
407 @classmethod
408 def suitable(cls, url):
409 """Receives a URL and returns True if suitable for this IE."""
410
411 # This does not use has/getattr intentionally - we want to know whether
412 # we have cached the regexp for *this* class, whereas getattr would also
413 # match the superclass
414 if '_VALID_URL_RE' not in cls.__dict__:
415 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
416 return cls._VALID_URL_RE.match(url) is not None
417
418 @classmethod
419 def _match_id(cls, url):
420 if '_VALID_URL_RE' not in cls.__dict__:
421 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
422 m = cls._VALID_URL_RE.match(url)
423 assert m
424 return compat_str(m.group('id'))
425
426 @classmethod
427 def working(cls):
428 """Getter method for _WORKING."""
429 return cls._WORKING
430
431 def initialize(self):
432 """Initializes an instance (authentication, etc)."""
433 self._initialize_geo_bypass({
434 'countries': self._GEO_COUNTRIES,
435 'ip_blocks': self._GEO_IP_BLOCKS,
436 })
437 if not self._ready:
438 self._real_initialize()
439 self._ready = True
440
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        # Only runs once per instance: a fake IP, once chosen, is kept for
        # the whole lifetime of this extractor.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                # NOTE: GeoUtils.random_ipv4 evidently accepts a country code
                # as well as a CIDR block (compare Path 1 above)
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))
528
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most two attempts: the second one only happens when the
            # first raises GeoRestrictedError and a fake X-Forwarded-For IP
            # could be set up for a retry (see __maybe_fake_ip_and_retry).
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Expose the faked IP in the result; presumably
                        # consumed downstream by YoutubeDL so that later
                        # requests reuse the same X-Forwarded-For value
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already in a user-facing form - re-raise unchanged
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            # Programming errors in the extractor are wrapped (non-expected)
            raise ExtractorError('An extractor error has occurred.', cause=e)
549
550 def __maybe_fake_ip_and_retry(self, countries):
551 if (not self._downloader.params.get('geo_bypass_country', None)
552 and self._GEO_BYPASS
553 and self._downloader.params.get('geo_bypass', True)
554 and not self._x_forwarded_for_ip
555 and countries):
556 country_code = random.choice(countries)
557 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
558 if self._x_forwarded_for_ip:
559 self.report_warning(
560 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
561 % (self._x_forwarded_for_ip, country_code.upper()))
562 return True
563 return False
564
565 def set_downloader(self, downloader):
566 """Sets the downloader for this IE."""
567 self._downloader = downloader
568
569 def _real_initialize(self):
570 """Real initialization process. Redefine in subclasses."""
571 pass
572
573 def _real_extract(self, url):
574 """Real extraction process. Redefine in subclasses."""
575 pass
576
577 @classmethod
578 def ie_key(cls):
579 """A string for getting the InfoExtractor with get_info_extractor"""
580 return compat_str(cls.__name__[:-2])
581
582 @property
583 def IE_NAME(self):
584 return compat_str(type(self).__name__[:-2])
585
586 @staticmethod
587 def __can_accept_status_code(err, expected_status):
588 assert isinstance(err, compat_urllib_error.HTTPError)
589 if expected_status is None:
590 return False
591 if isinstance(expected_status, compat_integer_types):
592 return err.code == expected_status
593 elif isinstance(expected_status, (list, tuple)):
594 return err.code in expected_status
595 elif callable(expected_status):
596 return expected_status(err.code) is True
597 else:
598 assert False
599
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        # Report what is being downloaded (note=False suppresses the message)
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        # Merge data/headers/query into an existing Request object, or build
        # a fresh sanitized Request from a plain URL
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        # ssl.CertificateError may be absent in some environments, hence the
        # hasattr() check before adding it to the caught exception list
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                # Caller may have declared this status code acceptable via
                # expected_status; in that case hand back the error body
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
657
658 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
659 """
660 Return a tuple (page content as string, URL handle).
661
662 See _download_webpage docstring for arguments specification.
663 """
664 # Strip hashes from the URL (#1038)
665 if isinstance(url_or_request, (compat_str, str)):
666 url_or_request = url_or_request.partition('#')[0]
667
668 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
669 if urlh is False:
670 assert not fatal
671 return False
672 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
673 return (content, urlh)
674
675 @staticmethod
676 def _guess_encoding_from_content(content_type, webpage_bytes):
677 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
678 if m:
679 encoding = m.group(1)
680 else:
681 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
682 webpage_bytes[:1024])
683 if m:
684 encoding = m.group(1).decode('ascii')
685 elif webpage_bytes.startswith(b'\xff\xfe'):
686 encoding = 'utf-16'
687 else:
688 encoding = 'utf-8'
689
690 return encoding
691
    def __check_blocked(self, content):
        """Raise ExtractorError (expected=True) if *content* looks like a
        censorship or filtering block page rather than the real webpage."""
        first_block = content[:512]
        # Websense corporate filtering software
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government censorship block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government blocklist (rkn.gov.ru)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
719
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of *urlh*, decode it to text and return it.

        Also honours the dump_intermediate_pages and write_pages options,
        and raises on known censorship block pages (see __check_blocked).
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            # Keep the base name within a sane length by replacing the
            # overlong tail with an md5 hash of the full name
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown/bogus encoding name - fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
756
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Requestobject
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        # Retry only on IncompleteRead, up to `tries` attempts, sleeping
        # `timeout` seconds between consecutive attempts
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        # res is False only in non-fatal mode; otherwise it is
        # a (content, urlh) tuple and only the content is returned
        if res is False:
            return res
        else:
            content, _ = res
            return content
814
815 def _download_xml_handle(
816 self, url_or_request, video_id, note='Downloading XML',
817 errnote='Unable to download XML', transform_source=None,
818 fatal=True, encoding=None, data=None, headers={}, query={},
819 expected_status=None):
820 """
821 Return a tuple (xml as an compat_etree_Element, URL handle).
822
823 See _download_webpage docstring for arguments specification.
824 """
825 res = self._download_webpage_handle(
826 url_or_request, video_id, note, errnote, fatal=fatal,
827 encoding=encoding, data=data, headers=headers, query=query,
828 expected_status=expected_status)
829 if res is False:
830 return res
831 xml_string, urlh = res
832 return self._parse_xml(
833 xml_string, video_id, transform_source=transform_source,
834 fatal=fatal), urlh
835
836 def _download_xml(
837 self, url_or_request, video_id,
838 note='Downloading XML', errnote='Unable to download XML',
839 transform_source=None, fatal=True, encoding=None,
840 data=None, headers={}, query={}, expected_status=None):
841 """
842 Return the xml as an compat_etree_Element.
843
844 See _download_webpage docstring for arguments specification.
845 """
846 res = self._download_xml_handle(
847 url_or_request, video_id, note=note, errnote=errnote,
848 transform_source=transform_source, fatal=fatal, encoding=encoding,
849 data=data, headers=headers, query=query,
850 expected_status=expected_status)
851 return res if res is False else res[0]
852
853 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
854 if transform_source:
855 xml_string = transform_source(xml_string)
856 try:
857 return compat_etree_fromstring(xml_string.encode('utf-8'))
858 except compat_xml_parse_error as ve:
859 errmsg = '%s: Failed to parse XML ' % video_id
860 if fatal:
861 raise ExtractorError(errmsg, cause=ve)
862 else:
863 self.report_warning(errmsg + str(ve))
864
865 def _download_json_handle(
866 self, url_or_request, video_id, note='Downloading JSON metadata',
867 errnote='Unable to download JSON metadata', transform_source=None,
868 fatal=True, encoding=None, data=None, headers={}, query={},
869 expected_status=None):
870 """
871 Return a tuple (JSON object, URL handle).
872
873 See _download_webpage docstring for arguments specification.
874 """
875 res = self._download_webpage_handle(
876 url_or_request, video_id, note, errnote, fatal=fatal,
877 encoding=encoding, data=data, headers=headers, query=query,
878 expected_status=expected_status)
879 if res is False:
880 return res
881 json_string, urlh = res
882 return self._parse_json(
883 json_string, video_id, transform_source=transform_source,
884 fatal=fatal), urlh
885
886 def _download_json(
887 self, url_or_request, video_id, note='Downloading JSON metadata',
888 errnote='Unable to download JSON metadata', transform_source=None,
889 fatal=True, encoding=None, data=None, headers={}, query={},
890 expected_status=None):
891 """
892 Return the JSON object as a dict.
893
894 See _download_webpage docstring for arguments specification.
895 """
896 res = self._download_json_handle(
897 url_or_request, video_id, note=note, errnote=errnote,
898 transform_source=transform_source, fatal=fatal, encoding=encoding,
899 data=data, headers=headers, query=query,
900 expected_status=expected_status)
901 return res if res is False else res[0]
902
903 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
904 if transform_source:
905 json_string = transform_source(json_string)
906 try:
907 return json.loads(json_string)
908 except ValueError as ve:
909 errmsg = '%s: Failed to parse JSON ' % video_id
910 if fatal:
911 raise ExtractorError(errmsg, cause=ve)
912 else:
913 self.report_warning(errmsg + str(ve))
914
915 def report_warning(self, msg, video_id=None):
916 idstr = '' if video_id is None else '%s: ' % video_id
917 self._downloader.report_warning(
918 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
919
920 def to_screen(self, msg):
921 """Print msg to screen, prefixing it with '[ie_name]'"""
922 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
923
924 def report_extraction(self, id_or_name):
925 """Report information extraction."""
926 self.to_screen('%s: Extracting information' % id_or_name)
927
928 def report_download_webpage(self, video_id):
929 """Report webpage download."""
930 self.to_screen('%s: Downloading webpage' % video_id)
931
932 def report_age_confirmation(self):
933 """Report attempt to confirm age."""
934 self.to_screen('Confirming age')
935
936 def report_login(self):
937 """Report attempt to log in."""
938 self.to_screen('Logging in')
939
940 @staticmethod
941 def raise_login_required(msg='This video is only available for registered users'):
942 raise ExtractorError(
943 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
944 expected=True)
945
946 @staticmethod
947 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
948 raise GeoRestrictedError(msg, countries=countries)
949
950 # Methods for following #608
951 @staticmethod
952 def url_result(url, ie=None, video_id=None, video_title=None):
953 """Returns a URL that points to a page that should be processed"""
954 # TODO: ie should be the class used for getting the info
955 video_info = {'_type': 'url',
956 'url': url,
957 'ie_key': ie}
958 if video_id is not None:
959 video_info['id'] = video_id
960 if video_title is not None:
961 video_info['title'] = video_title
962 return video_info
963
964 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
965 urls = orderedSet(
966 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
967 for m in matches)
968 return self.playlist_result(
969 urls, playlist_id=playlist_id, playlist_title=playlist_title)
970
971 @staticmethod
972 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
973 """Returns a playlist"""
974 video_info = {'_type': 'playlist',
975 'entries': entries}
976 if playlist_id:
977 video_info['id'] = playlist_id
978 if playlist_title:
979 video_info['title'] = playlist_title
980 if playlist_description:
981 video_info['description'] = playlist_description
982 return video_info
983
984 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
985 """
986 Perform a regex search on the given string, using a single or a list of
987 patterns returning the first matching group.
988 In case of failure return a default value or raise a WARNING or a
989 RegexNotFoundError, depending on fatal, specifying the field name.
990 """
991 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
992 mobj = re.search(pattern, string, flags)
993 else:
994 for p in pattern:
995 mobj = re.search(p, string, flags)
996 if mobj:
997 break
998
999 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1000 _name = '\033[0;34m%s\033[0m' % name
1001 else:
1002 _name = name
1003
1004 if mobj:
1005 if group is None:
1006 # return the first matching group
1007 return next(g for g in mobj.groups() if g is not None)
1008 else:
1009 return mobj.group(group)
1010 elif default is not NO_DEFAULT:
1011 return default
1012 elif fatal:
1013 raise RegexNotFoundError('Unable to extract %s' % _name)
1014 else:
1015 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1016 return None
1017
1018 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1019 """
1020 Like _search_regex, but strips HTML tags and unescapes entities.
1021 """
1022 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1023 if res:
1024 return clean_html(res).strip()
1025 else:
1026 return res
1027
1028 def _get_netrc_login_info(self, netrc_machine=None):
1029 username = None
1030 password = None
1031 netrc_machine = netrc_machine or self._NETRC_MACHINE
1032
1033 if self._downloader.params.get('usenetrc', False):
1034 try:
1035 info = netrc.netrc().authenticators(netrc_machine)
1036 if info is not None:
1037 username = info[0]
1038 password = info[2]
1039 else:
1040 raise netrc.NetrcParseError(
1041 'No authenticators for %s' % netrc_machine)
1042 except (IOError, netrc.NetrcParseError) as err:
1043 self._downloader.report_warning(
1044 'parsing .netrc: %s' % error_to_compat_str(err))
1045
1046 return username, password
1047
1048 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1049 """
1050 Get the login info as (username, password)
1051 First look for the manually specified credentials using username_option
1052 and password_option as keys in params dictionary. If no such credentials
1053 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1054 value.
1055 If there's no info available, return (None, None)
1056 """
1057 if self._downloader is None:
1058 return (None, None)
1059
1060 downloader_params = self._downloader.params
1061
1062 # Attempt to use provided username and password or .netrc data
1063 if downloader_params.get(username_option) is not None:
1064 username = downloader_params[username_option]
1065 password = downloader_params[password_option]
1066 else:
1067 username, password = self._get_netrc_login_info(netrc_machine)
1068
1069 return username, password
1070
1071 def _get_tfa_info(self, note='two-factor verification code'):
1072 """
1073 Get the two-factor authentication info
1074 TODO - asking the user will be required for sms/phone verify
1075 currently just uses the command line option
1076 If there's no info available, return None
1077 """
1078 if self._downloader is None:
1079 return None
1080 downloader_params = self._downloader.params
1081
1082 if downloader_params.get('twofactor') is not None:
1083 return downloader_params['twofactor']
1084
1085 return compat_getpass('Type %s and press [Return]: ' % note)
1086
1087 # Helper functions for extracting OpenGraph info
1088 @staticmethod
1089 def _og_regexes(prop):
1090 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1091 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1092 % {'prop': re.escape(prop)})
1093 template = r'<meta[^>]+?%s[^>]+?%s'
1094 return [
1095 template % (property_re, content_re),
1096 template % (content_re, property_re),
1097 ]
1098
1099 @staticmethod
1100 def _meta_regex(prop):
1101 return r'''(?isx)<meta
1102 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1103 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1104
1105 def _og_search_property(self, prop, html, name=None, **kargs):
1106 if not isinstance(prop, (list, tuple)):
1107 prop = [prop]
1108 if name is None:
1109 name = 'OpenGraph %s' % prop[0]
1110 og_regexes = []
1111 for p in prop:
1112 og_regexes.extend(self._og_regexes(p))
1113 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1114 if escaped is None:
1115 return None
1116 return unescapeHTML(escaped)
1117
1118 def _og_search_thumbnail(self, html, **kargs):
1119 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1120
1121 def _og_search_description(self, html, **kargs):
1122 return self._og_search_property('description', html, fatal=False, **kargs)
1123
1124 def _og_search_title(self, html, **kargs):
1125 return self._og_search_property('title', html, **kargs)
1126
1127 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1128 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1129 if secure:
1130 regexes = self._og_regexes('video:secure_url') + regexes
1131 return self._html_search_regex(regexes, html, name, **kargs)
1132
1133 def _og_search_url(self, html, **kargs):
1134 return self._og_search_property('url', html, **kargs)
1135
1136 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1137 if not isinstance(name, (list, tuple)):
1138 name = [name]
1139 if display_name is None:
1140 display_name = name[0]
1141 return self._html_search_regex(
1142 [self._meta_regex(n) for n in name],
1143 html, display_name, fatal=fatal, group='content', **kwargs)
1144
1145 def _dc_search_uploader(self, html):
1146 return self._html_search_meta('dc.creator', html, 'uploader')
1147
1148 def _rta_search(self, html):
1149 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1150 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1151 r' content="RTA-5042-1996-1400-1577-RTA"',
1152 html):
1153 return 18
1154 return 0
1155
1156 def _media_rating_search(self, html):
1157 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1158 rating = self._html_search_meta('rating', html)
1159
1160 if not rating:
1161 return None
1162
1163 RATING_TABLE = {
1164 'safe for kids': 0,
1165 'general': 8,
1166 '14 years': 14,
1167 'mature': 17,
1168 'restricted': 19,
1169 }
1170 return RATING_TABLE.get(rating.lower())
1171
1172 def _family_friendly_search(self, html):
1173 # See http://schema.org/VideoObject
1174 family_friendly = self._html_search_meta(
1175 'isFamilyFriendly', html, default=None)
1176
1177 if not family_friendly:
1178 return None
1179
1180 RATING_TABLE = {
1181 '1': 0,
1182 'true': 0,
1183 '0': 18,
1184 'false': 18,
1185 }
1186 return RATING_TABLE.get(family_friendly.lower())
1187
1188 def _twitter_search_player(self, html):
1189 return self._html_search_meta('twitter:player', html,
1190 'twitter card player')
1191
    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        """Extract video metadata from JSON-LD script blocks in html.

        Returns the merged info dict; falls back to kwargs['default'] when
        supplied, raises RegexNotFoundError when fatal, otherwise warns and
        returns {}.
        """
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        # NOTE(review): NO_DEFAULT is a sentinel; `is` would be the more
        # precise comparison here than `==` — confirm and align.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            # A script block may contain a single object or an array of them.
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
1220
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert parsed JSON-LD data into an info dict.

        json_ld may be a JSON string, a dict or a list/tuple of dicts.
        When expected_type is given, only items of that @type contribute.
        Returns an info dict with None values filtered out ({} when nothing
        usable was found).
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps the last URL component of a schema.org interaction @type to
        # the corresponding *_count key of the info dict.
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or an object with @type.
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill info['<kind>_count'] from InteractionCounter entries;
            # the first value seen for each kind wins.
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Populate info from a schema.org VideoObject entry.
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only top-level JSON-LD objects (carrying @context) are
            # inspected; nested video objects are reached explicitly below.
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Without an expected_type keep scanning for more data;
                    # with one, the first matching item is authoritative.
                    if expected_type is None:
                        continue
                    else:
                        break
                # Articles etc. may embed their video as a nested object.
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
1350
1351 @staticmethod
1352 def _hidden_inputs(html):
1353 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1354 hidden_inputs = {}
1355 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1356 attrs = extract_attributes(input)
1357 if not input:
1358 continue
1359 if attrs.get('type') not in ('hidden', 'submit'):
1360 continue
1361 name = attrs.get('name') or attrs.get('id')
1362 value = attrs.get('value')
1363 if name and value is not None:
1364 hidden_inputs[name] = value
1365 return hidden_inputs
1366
1367 def _form_hidden_inputs(self, form_id, html):
1368 form = self._search_regex(
1369 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1370 html, '%s form' % form_id, group='form')
1371 return self._hidden_inputs(form)
1372
    def _sort_formats(self, formats, field_preference=None):
        """Sort formats in place from worst to best quality.

        field_preference, when given as a list/tuple of field names,
        replaces the built-in heuristic ordering with a plain field-by-field
        comparison. Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Caller-specified field ordering; missing values sort lowest
                # ('' for format_id, -1 for everything else).
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Sort key compared lexicographically; missing numeric fields
            # count as -1, a missing format_id as ''.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1448
1449 def _check_formats(self, formats, video_id):
1450 if formats:
1451 formats[:] = filter(
1452 lambda f: self._is_valid_url(
1453 f['url'], video_id,
1454 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1455 formats)
1456
1457 @staticmethod
1458 def _remove_duplicate_formats(formats):
1459 format_urls = set()
1460 unique_formats = []
1461 for f in formats:
1462 if f['url'] not in format_urls:
1463 format_urls.add(f['url'])
1464 unique_formats.append(f)
1465 formats[:] = unique_formats
1466
1467 def _is_valid_url(self, url, video_id, item='video', headers={}):
1468 url = self._proto_relative_url(url, scheme='http:')
1469 # For now assume non HTTP(S) URLs always valid
1470 if not (url.startswith('http://') or url.startswith('https://')):
1471 return True
1472 try:
1473 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1474 return True
1475 except ExtractorError as e:
1476 self.to_screen(
1477 '%s: %s URL is invalid, skipping: %s'
1478 % (video_id, item, error_to_compat_str(e.cause)))
1479 return False
1480
1481 def http_scheme(self):
1482 """ Either "http:" or "https:", depending on the user's preferences """
1483 return (
1484 'http:'
1485 if self._downloader.params.get('prefer_insecure', False)
1486 else 'https:')
1487
1488 def _proto_relative_url(self, url, scheme=None):
1489 if url is None:
1490 return url
1491 if url.startswith('//'):
1492 if scheme is None:
1493 scheme = self.http_scheme()
1494 return scheme + url
1495 else:
1496 return url
1497
1498 def _sleep(self, timeout, video_id, msg_template=None):
1499 if msg_template is None:
1500 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1501 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1502 self.to_screen(msg)
1503 time.sleep(timeout)
1504
1505 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1506 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1507 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1508 manifest = self._download_xml(
1509 manifest_url, video_id, 'Downloading f4m manifest',
1510 'Unable to download f4m manifest',
1511 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1512 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1513 transform_source=transform_source,
1514 fatal=fatal, data=data, headers=headers, query=query)
1515
1516 if manifest is False:
1517 return []
1518
1519 return self._parse_f4m_formats(
1520 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1521 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1522
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m (HDS) manifest element into a list of format dicts.

        manifest is a compat_etree_Element; manifest_url is used to resolve
        relative media URLs and for recursive extraction of nested set-level
        manifests. Returns [] for DRM-protected or unparseable manifests.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        # NOTE(review): akamai_pv.text may be None for an empty element, which
        # would make the `in` test raise TypeError — confirm such manifests
        # cannot occur in practice.
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio-only mimeType marks every media entry as video-less.
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # format_id falls back to the entry index when no bitrate given.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1623
1624 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1625 return {
1626 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1627 'url': m3u8_url,
1628 'ext': ext,
1629 'protocol': 'm3u8',
1630 'preference': preference - 100 if preference else -100,
1631 'resolution': 'multiple',
1632 'format_note': 'Quality selection URL',
1633 }
1634
1635 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1636 entry_protocol='m3u8', preference=None,
1637 m3u8_id=None, note=None, errnote=None,
1638 fatal=True, live=False, data=None, headers={},
1639 query={}):
1640 res = self._download_webpage_handle(
1641 m3u8_url, video_id,
1642 note=note or 'Downloading m3u8 information',
1643 errnote=errnote or 'Failed to download m3u8 information',
1644 fatal=fatal, data=data, headers=headers, query=query)
1645
1646 if res is False:
1647 return []
1648
1649 m3u8_doc, urlh = res
1650 m3u8_url = urlh.geturl()
1651
1652 return self._parse_m3u8_formats(
1653 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1654 preference=preference, m3u8_id=m3u8_id, live=live)
1655
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse formats out of an HLS manifest document.

        m3u8_doc is the manifest text, m3u8_url its (final) URL used to
        resolve relative entries. Returns a list of format dicts; DRM'd
        manifests (Adobe Flash Access, Apple FairPlay) yield an empty list.
        A media playlist (as opposed to a master playlist) is returned as a
        single format pointing at m3u8_url itself.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # group_id -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; they apply to
        # the URI line that follows it.
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Parse one #EXT-X-MEDIA line into a rendition entry (and a
            # standalone format if it carries its own URI).
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment, non-blank line is the variant URI belonging
                # to the preceding EXT-X-STREAM-INF tag.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

                last_stream_inf = {}
        return formats
1833
1834 @staticmethod
1835 def _xpath_ns(path, namespace=None):
1836 if not namespace:
1837 return path
1838 out = []
1839 for c in path.split('/'):
1840 if not c or c == '.':
1841 out.append(c)
1842 else:
1843 out.append('{%s}%s' % (namespace, c))
1844 return '/'.join(out)
1845
1846 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1847 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1848
1849 if smil is False:
1850 assert not fatal
1851 return []
1852
1853 namespace = self._parse_smil_namespace(smil)
1854
1855 return self._parse_smil_formats(
1856 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1857
1858 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1859 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1860 if smil is False:
1861 return {}
1862 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1863
1864 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1865 return self._download_xml(
1866 smil_url, video_id, 'Downloading SMIL file',
1867 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1868
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build a full info dict (formats, subtitles, metadata) from a parsed SMIL document."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # NOTE: the video_id argument is deliberately discarded here; the id
        # is derived from the SMIL URL's file name instead.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        # Scan <head><meta> entries for title/description/date metadata.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
1908
1909 def _parse_smil_namespace(self, smil):
1910 return self._search_regex(
1911 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1912
1913 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1914 base = smil_url
1915 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1916 b = meta.get('base') or meta.get('httpBase')
1917 if b:
1918 base = b
1919 break
1920
1921 formats = []
1922 rtmp_count = 0
1923 http_count = 0
1924 m3u8_count = 0
1925
1926 srcs = []
1927 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1928 for medium in media:
1929 src = medium.get('src')
1930 if not src or src in srcs:
1931 continue
1932 srcs.append(src)
1933
1934 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1935 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1936 width = int_or_none(medium.get('width'))
1937 height = int_or_none(medium.get('height'))
1938 proto = medium.get('proto')
1939 ext = medium.get('ext')
1940 src_ext = determine_ext(src)
1941 streamer = medium.get('streamer') or base
1942
1943 if proto == 'rtmp' or streamer.startswith('rtmp'):
1944 rtmp_count += 1
1945 formats.append({
1946 'url': streamer,
1947 'play_path': src,
1948 'ext': 'flv',
1949 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1950 'tbr': bitrate,
1951 'filesize': filesize,
1952 'width': width,
1953 'height': height,
1954 })
1955 if transform_rtmp_url:
1956 streamer, src = transform_rtmp_url(streamer, src)
1957 formats[-1].update({
1958 'url': streamer,
1959 'play_path': src,
1960 })
1961 continue
1962
1963 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1964 src_url = src_url.strip()
1965
1966 if proto == 'm3u8' or src_ext == 'm3u8':
1967 m3u8_formats = self._extract_m3u8_formats(
1968 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1969 if len(m3u8_formats) == 1:
1970 m3u8_count += 1
1971 m3u8_formats[0].update({
1972 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1973 'tbr': bitrate,
1974 'width': width,
1975 'height': height,
1976 })
1977 formats.extend(m3u8_formats)
1978 elif src_ext == 'f4m':
1979 f4m_url = src_url
1980 if not f4m_params:
1981 f4m_params = {
1982 'hdcore': '3.2.0',
1983 'plugin': 'flowplayer-3.2.0.1',
1984 }
1985 f4m_url += '&' if '?' in f4m_url else '?'
1986 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1987 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1988 elif src_ext == 'mpd':
1989 formats.extend(self._extract_mpd_formats(
1990 src_url, video_id, mpd_id='dash', fatal=False))
1991 elif re.search(r'\.ism/[Mm]anifest', src_url):
1992 formats.extend(self._extract_ism_formats(
1993 src_url, video_id, ism_id='mss', fatal=False))
1994 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1995 http_count += 1
1996 formats.append({
1997 'url': src_url,
1998 'ext': ext or src_ext or 'flv',
1999 'format_id': 'http-%d' % (bitrate or http_count),
2000 'tbr': bitrate,
2001 'filesize': filesize,
2002 'width': width,
2003 'height': height,
2004 })
2005
2006 return formats
2007
2008 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2009 urls = []
2010 subtitles = {}
2011 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2012 src = textstream.get('src')
2013 if not src or src in urls:
2014 continue
2015 urls.append(src)
2016 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2017 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2018 subtitles.setdefault(lang, []).append({
2019 'url': src,
2020 'ext': ext,
2021 })
2022 return subtitles
2023
2024 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2025 xspf = self._download_xml(
2026 xspf_url, playlist_id, 'Downloading xpsf playlist',
2027 'Unable to download xspf manifest', fatal=fatal)
2028 if xspf is False:
2029 return []
2030 return self._parse_xspf(
2031 xspf, playlist_id, xspf_url=xspf_url,
2032 xspf_base_url=base_url(xspf_url))
2033
2034 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2035 NS_MAP = {
2036 'xspf': 'http://xspf.org/ns/0/',
2037 's1': 'http://static.streamone.nl/player/ns/0',
2038 }
2039
2040 entries = []
2041 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2042 title = xpath_text(
2043 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2044 description = xpath_text(
2045 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2046 thumbnail = xpath_text(
2047 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2048 duration = float_or_none(
2049 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2050
2051 formats = []
2052 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2053 format_url = urljoin(xspf_base_url, location.text)
2054 if not format_url:
2055 continue
2056 formats.append({
2057 'url': format_url,
2058 'manifest_url': xspf_url,
2059 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2060 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2061 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2062 })
2063 self._sort_formats(formats)
2064
2065 entries.append({
2066 'id': playlist_id,
2067 'title': title,
2068 'description': description,
2069 'thumbnail': thumbnail,
2070 'duration': duration,
2071 'formats': formats,
2072 })
2073 return entries
2074
2075 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2076 res = self._download_xml_handle(
2077 mpd_url, video_id,
2078 note=note or 'Downloading MPD manifest',
2079 errnote=errnote or 'Failed to download MPD manifest',
2080 fatal=fatal, data=data, headers=headers, query=query)
2081 if res is False:
2082 return []
2083 mpd_doc, urlh = res
2084 if mpd_doc is None:
2085 return []
2086 mpd_base_url = base_url(urlh.geturl())
2087
2088 return self._parse_mpd_formats(
2089 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2090
2091 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2092 """
2093 Parse formats from MPD manifest.
2094 References:
2095 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2096 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2097 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2098 """
2099 if mpd_doc.get('type') == 'dynamic':
2100 return []
2101
2102 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2103
2104 def _add_ns(path):
2105 return self._xpath_ns(path, namespace)
2106
2107 def is_drm_protected(element):
2108 return element.find(_add_ns('ContentProtection')) is not None
2109
2110 def extract_multisegment_info(element, ms_parent_info):
2111 ms_info = ms_parent_info.copy()
2112
2113 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2114 # common attributes and elements. We will only extract relevant
2115 # for us.
2116 def extract_common(source):
2117 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2118 if segment_timeline is not None:
2119 s_e = segment_timeline.findall(_add_ns('S'))
2120 if s_e:
2121 ms_info['total_number'] = 0
2122 ms_info['s'] = []
2123 for s in s_e:
2124 r = int(s.get('r', 0))
2125 ms_info['total_number'] += 1 + r
2126 ms_info['s'].append({
2127 't': int(s.get('t', 0)),
2128 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2129 'd': int(s.attrib['d']),
2130 'r': r,
2131 })
2132 start_number = source.get('startNumber')
2133 if start_number:
2134 ms_info['start_number'] = int(start_number)
2135 timescale = source.get('timescale')
2136 if timescale:
2137 ms_info['timescale'] = int(timescale)
2138 segment_duration = source.get('duration')
2139 if segment_duration:
2140 ms_info['segment_duration'] = float(segment_duration)
2141
2142 def extract_Initialization(source):
2143 initialization = source.find(_add_ns('Initialization'))
2144 if initialization is not None:
2145 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2146
2147 segment_list = element.find(_add_ns('SegmentList'))
2148 if segment_list is not None:
2149 extract_common(segment_list)
2150 extract_Initialization(segment_list)
2151 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2152 if segment_urls_e:
2153 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2154 else:
2155 segment_template = element.find(_add_ns('SegmentTemplate'))
2156 if segment_template is not None:
2157 extract_common(segment_template)
2158 media = segment_template.get('media')
2159 if media:
2160 ms_info['media'] = media
2161 initialization = segment_template.get('initialization')
2162 if initialization:
2163 ms_info['initialization'] = initialization
2164 else:
2165 extract_Initialization(segment_template)
2166 return ms_info
2167
2168 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2169 formats = []
2170 for period in mpd_doc.findall(_add_ns('Period')):
2171 period_duration = parse_duration(period.get('duration')) or mpd_duration
2172 period_ms_info = extract_multisegment_info(period, {
2173 'start_number': 1,
2174 'timescale': 1,
2175 })
2176 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2177 if is_drm_protected(adaptation_set):
2178 continue
2179 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2180 for representation in adaptation_set.findall(_add_ns('Representation')):
2181 if is_drm_protected(representation):
2182 continue
2183 representation_attrib = adaptation_set.attrib.copy()
2184 representation_attrib.update(representation.attrib)
2185 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2186 mime_type = representation_attrib['mimeType']
2187 content_type = mime_type.split('/')[0]
2188 if content_type == 'text':
2189 # TODO implement WebVTT downloading
2190 pass
2191 elif content_type in ('video', 'audio'):
2192 base_url = ''
2193 for element in (representation, adaptation_set, period, mpd_doc):
2194 base_url_e = element.find(_add_ns('BaseURL'))
2195 if base_url_e is not None:
2196 base_url = base_url_e.text + base_url
2197 if re.match(r'^https?://', base_url):
2198 break
2199 if mpd_base_url and not re.match(r'^https?://', base_url):
2200 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2201 mpd_base_url += '/'
2202 base_url = mpd_base_url + base_url
2203 representation_id = representation_attrib.get('id')
2204 lang = representation_attrib.get('lang')
2205 url_el = representation.find(_add_ns('BaseURL'))
2206 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2207 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2208 f = {
2209 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2210 'manifest_url': mpd_url,
2211 'ext': mimetype2ext(mime_type),
2212 'width': int_or_none(representation_attrib.get('width')),
2213 'height': int_or_none(representation_attrib.get('height')),
2214 'tbr': float_or_none(bandwidth, 1000),
2215 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2216 'fps': int_or_none(representation_attrib.get('frameRate')),
2217 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2218 'format_note': 'DASH %s' % content_type,
2219 'filesize': filesize,
2220 'container': mimetype2ext(mime_type) + '_dash',
2221 }
2222 f.update(parse_codecs(representation_attrib.get('codecs')))
2223 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2224
2225 def prepare_template(template_name, identifiers):
2226 tmpl = representation_ms_info[template_name]
2227 # First of, % characters outside $...$ templates
2228 # must be escaped by doubling for proper processing
2229 # by % operator string formatting used further (see
2230 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2231 t = ''
2232 in_template = False
2233 for c in tmpl:
2234 t += c
2235 if c == '$':
2236 in_template = not in_template
2237 elif c == '%' and not in_template:
2238 t += c
2239 # Next, $...$ templates are translated to their
2240 # %(...) counterparts to be used with % operator
2241 t = t.replace('$RepresentationID$', representation_id)
2242 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2243 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2244 t.replace('$$', '$')
2245 return t
2246
2247 # @initialization is a regular template like @media one
2248 # so it should be handled just the same way (see
2249 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2250 if 'initialization' in representation_ms_info:
2251 initialization_template = prepare_template(
2252 'initialization',
2253 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2254 # $Time$ shall not be included for @initialization thus
2255 # only $Bandwidth$ remains
2256 ('Bandwidth', ))
2257 representation_ms_info['initialization_url'] = initialization_template % {
2258 'Bandwidth': bandwidth,
2259 }
2260
2261 def location_key(location):
2262 return 'url' if re.match(r'^https?://', location) else 'path'
2263
2264 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2265
2266 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2267 media_location_key = location_key(media_template)
2268
2269 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2270 # can't be used at the same time
2271 if '%(Number' in media_template and 's' not in representation_ms_info:
2272 segment_duration = None
2273 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2274 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2275 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2276 representation_ms_info['fragments'] = [{
2277 media_location_key: media_template % {
2278 'Number': segment_number,
2279 'Bandwidth': bandwidth,
2280 },
2281 'duration': segment_duration,
2282 } for segment_number in range(
2283 representation_ms_info['start_number'],
2284 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2285 else:
2286 # $Number*$ or $Time$ in media template with S list available
2287 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2288 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2289 representation_ms_info['fragments'] = []
2290 segment_time = 0
2291 segment_d = None
2292 segment_number = representation_ms_info['start_number']
2293
2294 def add_segment_url():
2295 segment_url = media_template % {
2296 'Time': segment_time,
2297 'Bandwidth': bandwidth,
2298 'Number': segment_number,
2299 }
2300 representation_ms_info['fragments'].append({
2301 media_location_key: segment_url,
2302 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2303 })
2304
2305 for num, s in enumerate(representation_ms_info['s']):
2306 segment_time = s.get('t') or segment_time
2307 segment_d = s['d']
2308 add_segment_url()
2309 segment_number += 1
2310 for r in range(s.get('r', 0)):
2311 segment_time += segment_d
2312 add_segment_url()
2313 segment_number += 1
2314 segment_time += segment_d
2315 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2316 # No media template
2317 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2318 # or any YouTube dashsegments video
2319 fragments = []
2320 segment_index = 0
2321 timescale = representation_ms_info['timescale']
2322 for s in representation_ms_info['s']:
2323 duration = float_or_none(s['d'], timescale)
2324 for r in range(s.get('r', 0) + 1):
2325 segment_uri = representation_ms_info['segment_urls'][segment_index]
2326 fragments.append({
2327 location_key(segment_uri): segment_uri,
2328 'duration': duration,
2329 })
2330 segment_index += 1
2331 representation_ms_info['fragments'] = fragments
2332 elif 'segment_urls' in representation_ms_info:
2333 # Segment URLs with no SegmentTimeline
2334 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2335 # https://github.com/ytdl-org/youtube-dl/pull/14844
2336 fragments = []
2337 segment_duration = float_or_none(
2338 representation_ms_info['segment_duration'],
2339 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2340 for segment_url in representation_ms_info['segment_urls']:
2341 fragment = {
2342 location_key(segment_url): segment_url,
2343 }
2344 if segment_duration:
2345 fragment['duration'] = segment_duration
2346 fragments.append(fragment)
2347 representation_ms_info['fragments'] = fragments
2348 # If there is a fragments key available then we correctly recognized fragmented media.
2349 # Otherwise we will assume unfragmented media with direct access. Technically, such
2350 # assumption is not necessarily correct since we may simply have no support for
2351 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2352 if 'fragments' in representation_ms_info:
2353 f.update({
2354 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2355 'url': mpd_url or base_url,
2356 'fragment_base_url': base_url,
2357 'fragments': [],
2358 'protocol': 'http_dash_segments',
2359 })
2360 if 'initialization_url' in representation_ms_info:
2361 initialization_url = representation_ms_info['initialization_url']
2362 if not f.get('url'):
2363 f['url'] = initialization_url
2364 f['fragments'].append({location_key(initialization_url): initialization_url})
2365 f['fragments'].extend(representation_ms_info['fragments'])
2366 else:
2367 # Assuming direct URL to unfragmented media.
2368 f['url'] = base_url
2369 formats.append(f)
2370 else:
2371 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2372 return formats
2373
2374 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2375 res = self._download_xml_handle(
2376 ism_url, video_id,
2377 note=note or 'Downloading ISM manifest',
2378 errnote=errnote or 'Failed to download ISM manifest',
2379 fatal=fatal, data=data, headers=headers, query=query)
2380 if res is False:
2381 return []
2382 ism_doc, urlh = res
2383 if ism_doc is None:
2384 return []
2385
2386 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2387
2388 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2389 """
2390 Parse formats from ISM manifest.
2391 References:
2392 1. [MS-SSTR]: Smooth Streaming Protocol,
2393 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2394 """
2395 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2396 return []
2397
2398 duration = int(ism_doc.attrib['Duration'])
2399 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2400
2401 formats = []
2402 for stream in ism_doc.findall('StreamIndex'):
2403 stream_type = stream.get('Type')
2404 if stream_type not in ('video', 'audio'):
2405 continue
2406 url_pattern = stream.attrib['Url']
2407 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2408 stream_name = stream.get('Name')
2409 for track in stream.findall('QualityLevel'):
2410 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2411 # TODO: add support for WVC1 and WMAP
2412 if fourcc not in ('H264', 'AVC1', 'AACL'):
2413 self.report_warning('%s is not a supported codec' % fourcc)
2414 continue
2415 tbr = int(track.attrib['Bitrate']) // 1000
2416 # [1] does not mention Width and Height attributes. However,
2417 # they're often present while MaxWidth and MaxHeight are
2418 # missing, so should be used as fallbacks
2419 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2420 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2421 sampling_rate = int_or_none(track.get('SamplingRate'))
2422
2423 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2424 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2425
2426 fragments = []
2427 fragment_ctx = {
2428 'time': 0,
2429 }
2430 stream_fragments = stream.findall('c')
2431 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2432 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2433 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2434 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2435 if not fragment_ctx['duration']:
2436 try:
2437 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2438 except IndexError:
2439 next_fragment_time = duration
2440 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2441 for _ in range(fragment_repeat):
2442 fragments.append({
2443 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2444 'duration': fragment_ctx['duration'] / stream_timescale,
2445 })
2446 fragment_ctx['time'] += fragment_ctx['duration']
2447
2448 format_id = []
2449 if ism_id:
2450 format_id.append(ism_id)
2451 if stream_name:
2452 format_id.append(stream_name)
2453 format_id.append(compat_str(tbr))
2454
2455 formats.append({
2456 'format_id': '-'.join(format_id),
2457 'url': ism_url,
2458 'manifest_url': ism_url,
2459 'ext': 'ismv' if stream_type == 'video' else 'isma',
2460 'width': width,
2461 'height': height,
2462 'tbr': tbr,
2463 'asr': sampling_rate,
2464 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2465 'acodec': 'none' if stream_type == 'video' else fourcc,
2466 'protocol': 'ism',
2467 'fragments': fragments,
2468 '_download_params': {
2469 'duration': duration,
2470 'timescale': stream_timescale,
2471 'width': width or 0,
2472 'height': height or 0,
2473 'fourcc': fourcc,
2474 'codec_private_data': track.get('CodecPrivateData'),
2475 'sampling_rate': sampling_rate,
2476 'channels': int_or_none(track.get('Channels', 2)),
2477 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2478 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2479 },
2480 })
2481 return formats
2482
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        Scans webpage for media tags (including AMP and delight-vr
        variants), parses their src attribute and any <source>/<track>
        children, and returns a list of info dicts ({'formats': [...],
        'subtitles': {...}, ...}) — one per media tag that yielded at
        least one format or subtitle.  m3u8/mpd sources are expanded via
        the corresponding manifest extractors (non-fatal).
        """
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive format hints (ext, codecs) from a MIME type attribute
            # such as 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"'.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Turn a single src attribute into (is_plain_url, formats).
            # Manifest URLs (m3u8/mpd) expand into multiple formats and
            # are flagged is_plain_url=False.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # First collect self-closing tags (<video ... />), which have no
        # inner content, hence the empty-string fourth tuple element.
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to parsing "1080p"-style labels
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # First label that parses as a bitrate wins
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
                for f in media_info['formats']:
                    f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2605
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        """Extract HDS, HLS and progressive HTTP formats from an Akamai
        manifest URL.

        hosts optionally maps 'hds', 'hls' and 'http' to replacement host
        names for the respective protocol URLs.
        """
        # hdnea= marks a signed (tokenized) URL; leave it untouched and
        # skip progressive HTTP derivation below.
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []

        hdcore_sign = 'hdcore=3.7.0'
        # Map the HLS master playlist URL (/i/.../master.m3u8) to its HDS
        # counterpart (/z/.../manifest.f4m).
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        # The hdcore parameter must also be carried into segment URLs
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        # Inverse mapping: HDS manifest URL back to the HLS master playlist
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # Only derive progressive URLs when the m3u8 video variants line
            # up with the comma-separated quality list (+1 allows for an
            # extra audio-only variant).
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        # i indexes qualities in lockstep with video formats
                        i += 1

        return formats
2661
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Extract formats from a Wowza Streaming Engine URL.

        Derives HLS/HDS/DASH manifest URLs and RTMP/RTSP variants from the
        stream base URL; any protocol listed in skip_protocols ('m3u8',
        'f4m', 'dash', 'smil', 'rtmp', 'rtsp') is omitted.
        """
        query = compat_urlparse.urlparse(url).query
        # Strip any trailing manifest filename to get the stream base URL
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # http(s) base; the 's' of the original secure scheme is preserved
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            # Append a manifest filename and restore the original query string
            m_url = '%s/%s' % (http_base_url, manifest)
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            # SMIL-based streams: take RTMP formats from the SMIL manifest
            # and clone each of them into an RTSP variant.
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    # RTSP uses a single URL instead of url + play_path
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            # Plain streams: construct direct rtmp:// and rtsp:// URLs
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
2714
2715 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2716 mobj = re.search(
2717 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
2718 webpage)
2719 if mobj:
2720 try:
2721 jwplayer_data = self._parse_json(mobj.group('options'),
2722 video_id=video_id,
2723 transform_source=transform_source)
2724 except ExtractorError:
2725 pass
2726 else:
2727 if isinstance(jwplayer_data, dict):
2728 return jwplayer_data
2729
2730 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2731 jwplayer_data = self._find_jwplayer_data(
2732 webpage, video_id, transform_source=js_to_json)
2733 return self._parse_jwplayer_data(
2734 jwplayer_data, video_id, *args, **kwargs)
2735
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build info dict(s) from a parsed JWPlayer setup/config dict.

        Returns [] when jwplayer_data is not a dict, a single info dict
        when exactly one playlist entry results, and a playlist result
        otherwise.  Handles several legacy JWPlayer config layouts
        (flattened playlist, single playlist item, flattened sources).
        """
        # try_get yields None unless jwplayer_data is a dict; True marks a
        # flattened config (media fields at top level, no 'playlist' key)
        flat_pl = try_get(jwplayer_data, lambda x: x.get('playlist') or True)
        if flat_pl is None:
            # not even a dict
            return []

        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if flat_pl is True:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, keyed by label ('en' default)
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                # require_title makes a missing title a hard KeyError
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
                'genre': clean_html(video_data.get('genre')),
                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
                'season_number': int_or_none(video_data.get('season')),
                'episode_number': int_or_none(video_data.get('episode')),
                'release_year': int_or_none(video_data.get('releasedate')),
                'age_limit': int_or_none(video_data.get('age_restriction')),
            }
            # A lone YouTube URL is delegated to the YouTube extractor
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                # avoid exception in case of only sttls
                if formats:
                    self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2817
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a JWPlayer 'sources' list into a list of format dicts.

        Duplicate source URLs are skipped; m3u8/mpd/smil sources expand
        through the corresponding manifest extractors (non-fatal); rtmp
        URLs are split into url + play_path and get rtmp_params merged in.
        """
        urls = set()
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.add(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                format_id = str_or_none(source.get('label'))
                height = int_or_none(source.get('height'))
                if height is None and format_id:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = parse_resolution(format_id).get('height')
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate'), scale=1000),
                    'filesize': int_or_none(source.get('filesize')),
                    'ext': ext,
                }
                if format_id:
                    a_format['format_id'] = format_id

                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split "rtmp://host/app/mp4:path" into the connection
                    # URL and the play path at the stream-type prefix.
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2884
2885 def _live_title(self, name):
2886 """ Generate the title for a live video """
2887 now = datetime.datetime.now()
2888 now_str = now.strftime('%Y-%m-%d %H:%M')
2889 return name + ' ' + now_str
2890
2891 def _int(self, v, name, fatal=False, **kwargs):
2892 res = int_or_none(v, **kwargs)
2893 if 'get_attr' in kwargs:
2894 print(getattr(v, kwargs['get_attr']))
2895 if res is None:
2896 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2897 if fatal:
2898 raise ExtractorError(msg)
2899 else:
2900 self._downloader.report_warning(msg)
2901 return res
2902
2903 def _float(self, v, name, fatal=False, **kwargs):
2904 res = float_or_none(v, **kwargs)
2905 if res is None:
2906 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2907 if fatal:
2908 raise ExtractorError(msg)
2909 else:
2910 self._downloader.report_warning(msg)
2911 return res
2912
2913 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2914 path='/', secure=False, discard=False, rest={}, **kwargs):
2915 cookie = compat_cookiejar_Cookie(
2916 0, name, value, port, port is not None, domain, True,
2917 domain.startswith('.'), path, True, secure, expire_time,
2918 discard, None, None, rest)
2919 self._downloader.cookiejar.set_cookie(cookie)
2920
2921 def _get_cookies(self, url):
2922 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
2923 req = sanitized_Request(url)
2924 self._downloader.cookiejar.add_cookie_header(req)
2925 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
2926
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            if sys.version_info[0] >= 3:
                # Header values come latin-1 decoded on Python 3;
                # round-trip through bytes to reinterpret them as UTF-8
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            # Non-greedy match picks the FIRST occurrence of the cookie
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break
2953
2954 def get_testcases(self, include_onlymatching=False):
2955 t = getattr(self, '_TEST', None)
2956 if t:
2957 assert not hasattr(self, '_TESTS'), \
2958 '%s has _TEST and _TESTS' % type(self).__name__
2959 tests = [t]
2960 else:
2961 tests = getattr(self, '_TESTS', [])
2962 for t in tests:
2963 if not include_onlymatching and t.get('only_matching', False):
2964 continue
2965 t['name'] = type(self).__name__[:-len('IE')]
2966 yield t
2967
2968 def is_suitable(self, age_limit):
2969 """ Test whether the extractor is generally suitable for the given
2970 age limit (i.e. pornographic sites are not, all others usually are) """
2971
2972 any_restricted = False
2973 for tc in self.get_testcases(include_onlymatching=False):
2974 if tc.get('playlist', []):
2975 tc = tc['playlist'][0]
2976 is_restricted = age_restricted(
2977 tc.get('info_dict', {}).get('age_limit'), age_limit)
2978 if not is_restricted:
2979 return True
2980 any_restricted = any_restricted or is_restricted
2981 return not any_restricted
2982
2983 def extract_subtitles(self, *args, **kwargs):
2984 if (self._downloader.params.get('writesubtitles', False)
2985 or self._downloader.params.get('listsubtitles')):
2986 return self._get_subtitles(*args, **kwargs)
2987 return {}
2988
    def _get_subtitles(self, *args, **kwargs):
        # Override in subclasses; called by extract_subtitles
        raise NotImplementedError('This method must be implemented by subclasses')
2991
2992 @staticmethod
2993 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2994 """ Merge subtitle items for one language. Items with duplicated URLs
2995 will be dropped. """
2996 list1_urls = set([item['url'] for item in subtitle_list1])
2997 ret = list(subtitle_list1)
2998 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2999 return ret
3000
3001 @classmethod
3002 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3003 """ Merge two subtitle dictionaries, language by language. """
3004 ret = dict(subtitle_dict1)
3005 for lang in subtitle_dict2:
3006 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
3007 return ret
3008
3009 def extract_automatic_captions(self, *args, **kwargs):
3010 if (self._downloader.params.get('writeautomaticsub', False)
3011 or self._downloader.params.get('listsubtitles')):
3012 return self._get_automatic_captions(*args, **kwargs)
3013 return {}
3014
    def _get_automatic_captions(self, *args, **kwargs):
        # Override in subclasses; called by extract_automatic_captions
        raise NotImplementedError('This method must be implemented by subclasses')
3017
3018 def mark_watched(self, *args, **kwargs):
3019 if (self._downloader.params.get('mark_watched', False)
3020 and (self._get_login_info()[0] is not None
3021 or self._downloader.params.get('cookiefile') is not None)):
3022 self._mark_watched(*args, **kwargs)
3023
    def _mark_watched(self, *args, **kwargs):
        # Override in subclasses; called by mark_watched
        raise NotImplementedError('This method must be implemented by subclasses')
3026
3027 def geo_verification_headers(self):
3028 headers = {}
3029 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3030 if geo_verification_proxy:
3031 headers['Ytdl-request-proxy'] = geo_verification_proxy
3032 return headers
3033
3034 def _generic_id(self, url):
3035 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3036
3037 def _generic_title(self, url):
3038 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3039
3040
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (one result), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search "URL" and dispatch to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if not prefix:
            # no prefix -> a single result
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            if n > self._MAX_RESULTS:
                # clamp to the extractor's maximum, with a warning
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY
|