path: root/youtube_dl/extractor/channel9.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    parse_filesize,
    qualities,
)


class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    Whether a given URL points to a video or a playlist is determined from
    the WT.entryid meta tag in the page HTML rather than from the URL
    itself, since the URL alone does not always make the type clear.
    '''
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
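    # For illustration, the two named groups on one of the test URLs below
    # (a sketch; _real_extract only relies on the match itself):
    #
    #   >>> m = re.match(Channel9IE._VALID_URL,
    #   ...     'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS')
    #   >>> m.group('contentpath'), m.group('rss')
    #   ('Events/DEVintersection/DEVintersection-2016', '/RSS')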

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
        'info_dict': {
            'id': 'Events/TechEd/Australia/2013/KOS002',
            'ext': 'mp4',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
            'duration': 4576,
            'thumbnail': r're:http://.*\.jpg',
            'session_code': 'KOS002',
            'session_day': 'Day 1',
            'session_room': 'Arena 1A',
            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
                                 'Mads Kristensen'],
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
        'info_dict': {
            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'ext': 'mp4',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
            'duration': 1540,
            'thumbnail': r're:http://.*\.jpg',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
            'duration': 5646,
            'thumbnail': r're:http://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'info_dict': {
            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
            'title': 'Channel 9',
        },
        'playlist_count': 2,
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
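    # e.g. _RSS_URL % 'Events/DEVintersection/DEVintersection-2016' expands to
    # 'http://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS'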

    def _formats_from_html(self, html):
        # The (?x) flag must sit at the very start of the pattern; modern
        # Python rejects global flags placed anywhere else.
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
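        # qualities() maps each label to its position in this list, so labels
        # listed later (e.g. 'High Quality MP4') are preferred when the
        # formats are sorted below.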
        quality = qualities((
            'MP3', 'MP4',
            'Low Quality WMV', 'Low Quality MP4',
            'Mid Quality WMV', 'Mid Quality MP4',
            'High Quality WMV', 'High Quality MP4'))
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize_approx': parse_filesize(x.group('filesize')),
            'quality': quality(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html)]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
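        # The "length" meta holds HH:MM:SS; e.g. "01:16:16" works out to
        # 4576 seconds, matching the first test's duration above.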
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
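        # Strip thousands separators before converting, e.g. '1,234' -> 1234.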
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day').strip() if m is not None else None

    def _extract_session_room(self, html):
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

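        # Up to three entries may be produced for one page: the slide deck,
        # the zip archive and the recording itself, all sharing the meta above.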
        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        if len(contents) > 1:
            raise ExtractorError('Got more than one entry')
        result = contents[0]
        result['authors'] = self._extract_authors(html)

        return result

    def _extract_session(self, html, content_path):
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return self.playlist_result(contents)

    def _extract_list(self, video_id, rss_url=None):
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')
        rss = mobj.group('rss')

        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

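        # WT.entryid appears to carry values like 'Entry:<id>' or
        # 'Session:<id>'; the regex keeps only the part before the first
        # colon (an assumption implied by the [^:]+ capture below).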
        page_type = self._search_regex(
            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
            webpage, 'page type', default=None, group='pagetype')
        if page_type:
            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)
