1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces the metadata of the
    video (or videos) that URL refers to: the real media URL, the title,
    the uploader and so on.  The metadata is collected in a dictionary that
    is handed to the FileDownloader, which may then download the video to
    the file system, among other possible outcomes.

    Every dictionary must carry these fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    These fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define the _real_initialize() and _real_extract()
    methods and define a _VALID_URL regexp; normally they should also be
    added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, set the _WORKING attribute to False for broken IEs so that
    users are warned and the tests are skipped.
    """

    # Class-level defaults; __init__ re-assigns the instance-level state.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Idempotent."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name with the trailing "IE" dropped.
        class_name = type(self).__name__
        return class_name[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None means "print the default message"; note=False silences it.
        if note is not False:
            if note is None:
                self.report_download_webpage(video_id)
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset advertised in the Content-Type header, if any;
        # otherwise assume UTF-8.
        content_type = urlh.headers.get('Content-Type', '')
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL was passed instead of a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return raw_page.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608: they set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url', 'url': url, 'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist', 'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
        (
            (?:https?://)?                                       # http(s):// (optional)
            (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
               tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
            (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
            (?:                                                  # the various things that can precede the ID:
                (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                |(?:                                             # or the v= param in all its forms
                    (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                    (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                    (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                    v=
                )
            )?                                                   # optional -> youtube.com/xxxx is OK
        )?                                                       # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
        (?(1).+)?                                                # if we found the ID, everything can follow
        $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed is assumed to be flv.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string, used only for --list-formats style output.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so let the playlist IE claim them.
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name}, or an (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of result tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print "itag : extension [dimensions]" for each known format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, when credentials are available, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form tokens required by the Google login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # urlencode() returns text; POST data must be bytes on Python 3
        # (ASCII-safe here), matching the login POST above.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video id embedded in url, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 2 is the bare video id (group 1 is the optional URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in turn until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # strptime only raises ValueError on a non-matching format;
                    # a bare except here would also swallow KeyboardInterrupt.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # Require 'sig' as well: url_map below concatenates it, and an
            # entry without it would otherwise crash with a KeyError instead
            # of falling through to the explicit "no known formats" error.
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter confirmation."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # urlencode() returns text; POST data must be bytes on Python 3
        # (ASCII-safe here), consistent with the other POSTs in this module.
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage.  Note that
        # _download_webpage() already returns a decoded text string, so the
        # old .decode('utf-8') calls on the match results were wrong: str has
        # no .decode() on Python 3, so they would raise AttributeError there.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
786
787
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def _real_extract(self, url):
        # Pull the video id out of the URL.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the path component up to the first '_' or '?'.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The media URLs live in a JS "flashvars" assignment.
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Walk the qualities from best to worst and keep the first one present.
        max_quality = None
        for key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the
        # official-user markup; a miss is only a warning.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; reassemble it as YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
868
869
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Matches only URLs whose 'current' query parameter points at a .flv file.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page.

        Returns a single-item list of info dicts, or None after reporting an
        error.  NOTE(review): the .decode('utf-8') calls imply the webpage
        (and everything matched on it) is a byte string, i.e. this is
        Python-2-era code — confirm before running under Python 3.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # The direct media URL lives in the 'file' parameter of the video_src link.
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from one <title> pattern: group 1 is the
        # title, group 2 the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
922
923
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video page.

        Non-'/watch/' URLs are first resolved to the canonical
        http://video.yahoo.com/watch/<vid>/<id> form and re-extracted
        (new_video=False marks that second pass).  Returns a single-item
        list of info dicts, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group(1) captures the 'people'/'profile' path component;
        # the uploader name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
1054
1055
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com page.

        Parses the embedded 'config' JSON for metadata and builds the
        play_redirect URL from the request signature/timestamp.  Returns a
        single-item list of info dicts, or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and direct-link URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's script block.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit; the split can raise IndexError and
        # json.loads raises ValueError.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (optional; empty string when absent)
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (optional), stored as YYYYMMDD
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket; the for/else reports an
        # error only when every bucket is empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1166
1167
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<digits>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        # Download url and return its body; returns None (after reporting
        # the error) when the download or the URL itself is invalid.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch url, apply regex, and build an info dict from matchTuples,
        # a list of (group_index, key, error_message) triples.  Returns None
        # (after reporting) when the page does not match or a group is empty.
        # NOTE(review): if fetch_webpage failed, page is None and re.search
        # raises TypeError here rather than reporting cleanly — confirm.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Resolve the live-stream RTMP URL via the videothek JS file.
        # NOTE(review): video_url is computed but never returned or stored,
        # so the result is discarded (and _real_extract returns nothing for
        # live URLs) — this method looks unfinished.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Follow the chain page -> videoref XML -> per-language video XML
        # and return a complete info dict for the HD stream.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        # Dispatch between live streams and regular (Plus7) videos.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): live extraction returns no info dict (see
            # extractLiveStream above), so live URLs yield nothing.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1291
1292
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that this is the fallback IE."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: signal the caller to extract url itself.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape common
        JW Player / SWFObject embed patterns from the page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this failure used to be misreported as
            # u'unable to extract title'.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1429
1430
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:<terms>' pseudo-URL and return results.

        Returns the playlist produced by _get_n_results, or None after
        reporting an error on a malformed query or count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the fixed 8-character 'ytsearch' prefix, keeping the count part.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUGFIX: this branch used to drop the result list (missing
            # return), so 'ytsearchall:' queries yielded nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results per page, until the limit
        # (capped by the API's reported totalItems) is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1501
1502
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query.decode() implies byte-string queries, i.e.
        # Python-2-era code — confirm before running under Python 3.
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse the 'gvsearch[N|all]:<terms>' pseudo-URL; results are queued
        # via self._downloader.download() rather than returned.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the fixed 8-character 'gvsearch' prefix, keeping the count part.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        # Page through the search results (10 per page) until n unique ids
        # are collected or the 'next page' marker disappears.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1580
1581
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Announce the fetch of one page of search results."""
        self.to_screen(u'query "%s": Downloading page %s' % (query.decode(preferredencoding()), pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:<terms>' pseudo-URL and queue downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the fixed 'yvsearch' part, keep the count
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Collect up to n unique video ids for query and queue each one."""

        collected = []
        seen = set()
        page_index = 1

        while True:
            self.report_download_page(query, page_index)
            search_request = compat_urllib_request.Request(
                self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_index))
            try:
                page = compat_urllib_request.urlopen(search_request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect new identifiers from this page, stopping once n are found.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in seen:
                    continue
                collected.append(candidate)
                seen.add(candidate)
                if len(collected) == n:
                    for vid in collected:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # No further pages: queue whatever was gathered and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in collected:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            page_index = page_index + 1
1663
1664
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE whitespace, so it must be
        # compiled with that flag here too.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch all playlist entries via the GData API and return one
        playlist result of Youtube url_results, ordered by position."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUGFIX: for an empty playlist the loop breaks before the first page
        # assigns playlist_title, which used to raise UnboundLocalError at
        # playlist_result() below; initialize it up front.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # NOTE(review): positions come straight from the JSON; if the API
        # returns them as strings this sorts lexicographically — confirm.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1744
1745
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is served as plain HTML.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present whenever more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Subsequent pages come from this JSON AJAX endpoint.
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated, in order."""
        found = []
        seen = set()
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in seen:
                seen.add(video_id)
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect every video of a channel and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1
        video_ids = []

        # Page 1: plain HTML channel listing.
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids.extend(self.extract_videos_from_page(page))

        # Remaining pages: JSON-based channel_ajax queries, until the
        # "load more" widget disappears.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1818
1819
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The gdata API caps each response at this many entries.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        end_index = start_index + self._GDATA_PAGE_SIZE
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, end_index))

    def _real_extract(self, url):
        """Return a playlist result with every upload of the given user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The Data API serves results page by page (currently 50 per
        # request), so keep fetching until a page comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the video ids on this page, deduplicated per page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page shorter than the page size must be the last one,
            # so we can stop querying.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title=username)]
1887
1888
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Results per AJAX query (currently fixed by the site).
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Collect every video of a blip.tv user via the paged AJAX API
        and return a single playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: the user-id lookup used to run inside the try block above
        # with no None check; a missing match raised an uncaught
        # AttributeError instead of a proper error report.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUGFIX: use compat_str for consistency with every other
                # error handler in this file (str(err) is py2/py3-fragile).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUGFIX: deduplicate on the unescaped id — the old code
                # compared the raw match but appended the unescaped value,
                # so entity-escaped duplicates slipped through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
1964
1965
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Request the file page with the 'Free download' form submitted
        and extract the direct download URL, title and extension."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        # BUGFIX: POST data must be bytes on Python 3.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication).encode('ascii'))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the response once up front.  The old code kept
            # the page as bytes, which made the str-pattern regexes below
            # fail on Python 3, and called .decode('utf-8') on values that
            # are already str (file_id, file_url, file_title, extension).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
2016
2017
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials.

        Logging in is optional: when no credentials are configured the
        method returns silently and extraction proceeds anonymously.
        Login failures are reported as warnings, never as hard errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # BUGFIX: POST data must be bytes on Python 3.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # BUGFIX: decode the response — matching a str pattern against
            # bytes raises TypeError on Python 3.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL (HD preferred, SD fallback), title,
        duration and thumbnail from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flash player parameters are embedded as a JSON array between
        # these two JavaScript markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON inside the outer JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2115
2116
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pattern used to take the file extension from the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Resolve a blip.tv URL to a single info dict.

        /play/ URLs are first resolved (via their redirect) to a canonical
        http://blip.tv/a/a-<id> URL and re-extracted recursively.
        Otherwise the page's JSON representation (skin=json) is requested;
        if the server answers with the media itself (video/* Content-Type),
        a direct-download entry is returned instead.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose URL fragment carries the
        # file id; rebuild the canonical URL from it and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the site for the JSON description of the video.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): title is already str on Python 3, where this
                # .decode() would raise AttributeError — looks Python-2-only;
                # confirm before relying on this branch under Python 3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle for the download itself.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above and is read here.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp looks like '10-31-12 09:15AM' -> 'YYYYMMDD'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2213
2214
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        """Download the watch page and return a one-element list with the
        video info; the flv URL is derived from the thumbnail URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.report_error — no such attribute,
            # so invalid URLs raised AttributeError instead of reporting.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media server base URL is only exposed via the thumbnail link.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2256
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                      (?P<showname>thedailyshow|colbertnation)\.com/
                      (full-episodes/(?P<episode>.*)|
                        (?P<clip>
                            (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                            |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest to highest; the last entry is the default pick.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension (all mp4 at present).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> display resolution, used only by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report that the media configuration XML is being fetched."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report that the episode's MRSS index is being fetched."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate formats for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip.

        Resolves shortname aliases (e.g. :tds) and show-front-page URLs to a
        concrete episode URL (following redirects), locates the mtvnservices
        media URI in the page, downloads the MRSS index listing the parts,
        then fetches the per-part configuration to choose a bitrate and
        rewrites the rtmp URL to a plain http one.  Returns one info dict
        per part.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the newest-episode
        # page of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest episode"; the
            # concrete episode URL is discovered via the redirect below.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirected us to the newest episode; re-parse the
            # final URL to get its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists every part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid is e.g. '...:<show>.com:<mediaid>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into a plain http one on the llnwd CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2448
2449
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report that the player configuration is being fetched."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the mp4 URL and metadata from an Escapist video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_page = response.read()
            # Decode with the charset advertised in the Content-Type header,
            # defaulting to utf-8 when none is declared.
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            webpage = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', webpage).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', webpage).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', webpage).group(1))
        # The player URL carries a quoted config URL in its query string.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2520
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor is currently marked as broken.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the Adobe f4m manifest,
        and assemble the first-fragment URL (extension 'f4f')."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Title/description/thumbnail and the manifest URL come from the
        # moogaloop player metadata XML.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # The hdcore parameter is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m-namespaced elements; note video_id is rebound here to the
            # manifest's own id value.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the first-fragment URL relative to the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2587
2588
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv URL is embedded urlencoded in the page source
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The page <title> carries the video title plus a site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail is a full image URL on the site's image servers
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2642
2643
class SoundcloudIE(InfoExtractor):
    """Extractor for individual soundcloud.com tracks.

    The track page is resolved through the public API to obtain its
    metadata, then the streams endpoint is queried for the MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Uploader and song slug are both encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the streams endpoint for the actual media URL
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2709
class SoundcloudSetIE(InfoExtractor):
    """Extractor for soundcloud.com sets (playlists).

    The set page is resolved through the public API, then the MP3
    stream URL is fetched for every track the set contains.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Uploader and set slug are both encoded in the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports failures as a list of error objects
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            try:
                stream_json = compat_urllib_request.urlopen(
                    compat_urllib_request.Request(streams_url)).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            videos.append({
                'id': video_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2783
2784
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The rtmp path is stored base64-encoded in an inline script variable
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # e.g. ".../something.mp4" -> id "something", ext "mp4"
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2834
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format entry maps bitrates to url lists, pick the requested
        bitrate (falling back to the highest one available); a flat entry
        (no bitrate info) is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url.
        # BUG FIX: re.match against a text url already yields text groups;
        # the old .decode('utf-8') calls crashed on Python 3 (str has no decode).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen().read() returns bytes; decode before loading)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format until one of its URLs responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2942
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Depending on which parts of the URL matched, this handles:
      * a specific video page -> returns a single-entry info list,
      * a course page         -> recursively extracts every linked video page,
      * the site root         -> recursively extracts every linked course page.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata (title and file name) lives in an XML file
            # next to the course's videos directory
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the video file name
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

            # Course title from the page <h1>, falling back to the course id
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Queue every linked VideoPage for recursive extraction
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Queue every linked CoursePage for recursive extraction
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3046
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUG FIX: webpage is already text, so the old
        # mobj.group(1).decode('iso-8859-1') calls crashed on Python 3
        # (str has no decode method); the groups are used directly.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: message previously read 'unable to mtvn_uri'
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3122
3123
3124 class YoukuIE(InfoExtractor):
3125 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3126
3127 def _gen_sid(self):
3128 nowTime = int(time.time() * 1000)
3129 random1 = random.randint(1000,1998)
3130 random2 = random.randint(1000,9999)
3131
3132 return "%d%d%d" %(nowTime,random1,random2)
3133
3134 def _get_file_ID_mix_string(self, seed):
3135 mixed = []
3136 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3137 seed = float(seed)
3138 for i in range(len(source)):
3139 seed = (seed * 211 + 30031 ) % 65536
3140 index = math.floor(seed / 65536 * len(source) )
3141 mixed.append(source[int(index)])
3142 source.remove(source[int(index)])
3143 #return ''.join(mixed)
3144 return mixed
3145
3146 def _get_file_id(self, fileId, seed):
3147 mixed = self._get_file_ID_mix_string(seed)
3148 ids = fileId.split('*')
3149 realId = []
3150 for ch in ids:
3151 if ch:
3152 realId.append(mixed[int(ch)])
3153 return ''.join(realId)
3154
3155 def _real_extract(self, url):
3156 mobj = re.match(self._VALID_URL, url)
3157 if mobj is None:
3158 self._downloader.report_error(u'invalid URL: %s' % url)
3159 return
3160 video_id = mobj.group('ID')
3161
3162 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3163
3164 request = compat_urllib_request.Request(info_url, None, std_headers)
3165 try:
3166 self.report_download_webpage(video_id)
3167 jsondata = compat_urllib_request.urlopen(request).read()
3168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3169 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3170 return
3171
3172 self.report_extraction(video_id)
3173 try:
3174 jsonstr = jsondata.decode('utf-8')
3175 config = json.loads(jsonstr)
3176
3177 video_title = config['data'][0]['title']
3178 seed = config['data'][0]['seed']
3179
3180 format = self._downloader.params.get('format', None)
3181 supported_format = list(config['data'][0]['streamfileids'].keys())
3182
3183 if format is None or format == 'best':
3184 if 'hd2' in supported_format:
3185 format = 'hd2'
3186 else:
3187 format = 'flv'
3188 ext = u'flv'
3189 elif format == 'worst':
3190 format = 'mp4'
3191 ext = u'mp4'
3192 else:
3193 format = 'flv'
3194 ext = u'flv'
3195
3196
3197 fileid = config['data'][0]['streamfileids'][format]
3198 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3199 except (UnicodeDecodeError, ValueError, KeyError):
3200 self._downloader.report_error(u'unable to extract info section')
3201 return
3202
3203 files_info=[]
3204 sid = self._gen_sid()
3205 fileid = self._get_file_id(fileid, seed)
3206
3207 #column 8,9 of fileid represent the segment number
3208 #fileid[7:9] should be changed
3209 for index, key in enumerate(keys):
3210
3211 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3212 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3213
3214 info = {
3215 'id': '%s_part%02d' % (video_id, index),
3216 'url': download_url,
3217 'uploader': None,
3218 'upload_date': None,
3219 'title': video_title,
3220 'ext': ext,
3221 }
3222 files_info.append(info)
3223
3224 return files_info
3225
3226
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Fetch the page and decode it as UTF-8
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # Scrape url, title and thumbnail; abort on the first miss
        fields = {}
        for key, pattern, errmsg in (
                ('url', self.VIDEO_URL_RE, u'unable to extract video url'),
                ('title', self.VIDEO_TITLE_RE, u'unable to extract video title'),
                ('thumbnail', self.VIDEO_THUMB_RE, u'unable to extract video thumbnail')):
            match = re.search(pattern, webpage)
            if match is None:
                self._downloader.report_error(errmsg)
                return
            fields[key] = match.group(1)

        return [{
            'id': video_id,
            'url': compat_urllib_parse.unquote(fields['url']),
            'uploader': None,
            'upload_date': None,
            'title': fields['title'],
            'ext': 'flv',
            'thumbnail': fields['thumbnail'],
            'description': None,
        }]
3281
3282
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video behind a Google+ post.

        Downloads the post page, scrapes date/uploader/title, follows the
        photo-viewer page it references and picks the highest-resolution
        video link found there.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # BUG FIX: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # BUG FIX: previously fell through and crashed indexing an empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3403
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The mp4 lives on a CDN path derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the info-dict
            # contract documented at the top of this file uses 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3439
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page; return (item_count, valid_item_infos).

        BUG FIX: the error paths used to return None, which crashed the
        `page_count, page_info = ...` tuple unpacking in _real_extract;
        they now return (0, []) so the pagination loop ends gracefully.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # The API signals errors with an object instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins "YYYY-MM-DD..."; keep just the date digits
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archive listing is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we reached the end
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3522
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bail out: continuing would call .group() on None below.
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> before giving up.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3561
class SteamIE(InfoExtractor):
    """Information extractor for Steam store game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is mandatory here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # These query parameters satisfy the age gate up front.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three finditer streams run in lockstep, one entry per trailer.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Skip this entry; previously an item with an empty URL was
                # still appended after reporting the error.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3605
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The recorded flv lives at a fixed CDN location derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader_match.group('uploader'),
        }]
3627
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Typo fix: the site is "World Star", not "World Start".
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
        return results
3683
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON assignment in a script tag.
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s variant from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3718
3719
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Pretend the age check has already been passed.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: previously this tested the stale 'result' regex match,
            # so a missing format was never reported and [None] was returned.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3836
3837
3838
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player setup code.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        match = re.search(VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group('url'))

        # Extract the upload date.
        # NOTE(review): the error message below says "title" but this step
        # extracts the date; kept verbatim to preserve behavior.
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        match = re.search(VIDEO_UPLOADED_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        upload_date = match.group('date')

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
3880
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the main page and take the title from it.
        webpage = self._download_webpage(url, video_id)
        match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = match.group('title').strip()

        # The actual player lives on a separate embed page.
        match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = match.group(0).strip()
        video_id = match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the flash player via addVariable.
        match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3926
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is enough to drive the play/next API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        res = []
        next_url = first_url
        track_index = 0
        # Walk the mix one track at a time until the API flags the last one.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return res
3970
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow fixed CDN patterns based on the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
3994
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: matches either a playlist URL or a single-talk URL.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the matched URL type: single talk vs. playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each playlist entry carries its talk id, item id and media slug.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk URL back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and the media slug out of the talkDetails blob.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE: unlike most extractors this helper returns a single info
        # dict; _real_extract wraps it in a list.
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
        return info
4072
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this fallback referenced the undefined name 'ext'
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4128
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document describes the available streams.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Take the last variant listed in the document.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4161
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Bug fix: bail out instead of calling .group() on None below.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4210
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId when the URL carries one.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4250
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Typo fix: message previously read "No video founded".
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        m_thumb = re.search(re_thumb, webpage)
        # Robustness fix: a missing poster no longer aborts the extraction
        # with an AttributeError on None.
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4284
4285
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # Most specific YouTube URL forms first, plain video URLs last.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        # GenericIE comes last, after every specific extractor has had
        # a chance to claim the URL (ordering rule documented above).
        GenericIE()
    ]
4342
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<name>IE" module-level naming convention.
    return globals()['%sIE' % ie_name]
|