youtube_dl/extractor/yam.py



    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import re
    5 
    6 from .common import InfoExtractor
    7 from ..compat import compat_urlparse
    8 from ..utils import (
    9     float_or_none,
   10     month_by_abbreviation,
   11     ExtractorError,
   12     get_element_by_attribute,
   13 )
   14 
   15 
   16 class YamIE(InfoExtractor):
   17     IE_DESC = '蕃薯藤yam天空部落'
   18     _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P<id>\d+)'
   19 
   20     _TESTS = [{
   21         # An audio hosted on Yam
   22         'url': 'http://mymedia.yam.com/m/2283921',
   23         'md5': 'c011b8e262a52d5473d9c2e3c9963b9c',
   24         'info_dict': {
   25             'id': '2283921',
   26             'ext': 'mp3',
   27             'title': '發現 - 趙薇 京華煙雲主題曲',
   28             'description': '發現 - 趙薇 京華煙雲主題曲',
   29             'uploader_id': 'princekt',
   30             'upload_date': '20080807',
   31             'duration': 313.0,
   32         }
   33     }, {
   34         # An external video hosted on YouTube
   35         'url': 'http://mymedia.yam.com/m/3599430',
   36         'md5': '03127cf10d8f35d120a9e8e52e3b17c6',
   37         'info_dict': {
   38             'id': 'CNpEoQlrIgA',
   39             'ext': 'mp4',
   40             'upload_date': '20150306',
   41             'uploader': '新莊社大瑜伽社',
   42             'description': 'md5:11e2e405311633ace874f2e6226c8b17',
   43             'uploader_id': '2323agoy',
   44             'title': '20090412陽明山二子坪-1',
   45         },
   46         'skip': 'Video does not exist',
   47     }, {
   48         'url': 'http://mymedia.yam.com/m/3598173',
   49         'info_dict': {
   50             'id': '3598173',
   51             'ext': 'mp4',
   52         },
   53         'skip': 'cause Yam system error',
   54     }, {
   55         'url': 'http://mymedia.yam.com/m/3599437',
   56         'info_dict': {
   57             'id': '3599437',
   58             'ext': 'mp4',
   59         },
   60         'skip': 'invalid YouTube URL',
   61     }, {
   62         'url': 'http://mymedia.yam.com/m/2373534',
   63         'md5': '7ff74b91b7a817269d83796f8c5890b1',
   64         'info_dict': {
   65             'id': '2373534',
   66             'ext': 'mp3',
   67             'title': '林俊傑&蔡卓妍-小酒窩',
   68             'description': 'md5:904003395a0fcce6cfb25028ff468420',
   69             'upload_date': '20080928',
   70             'uploader_id': 'onliner2',
   71         }
   72     }]
   73 
   74     def _real_extract(self, url):
   75         video_id = self._match_id(url)
   76         page = self._download_webpage(url, video_id)
   77 
   78         # Check for errors
   79         system_msg = self._html_search_regex(
   80             r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message',
   81             default=None)
   82         if system_msg:
   83             raise ExtractorError(system_msg, expected=True)
   84 
   85         # Is it hosted externally on YouTube?
   86         youtube_url = self._html_search_regex(
   87             r'<embed src="(http://www.youtube.com/[^"]+)"',
   88             page, 'YouTube url', default=None)
   89         if youtube_url:
   90             return self.url_result(youtube_url, 'Youtube')
   91 
   92         title = self._html_search_regex(
   93             r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')
   94 
   95         api_page = self._download_webpage(
   96             'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
   97             note='Downloading API page')
   98         api_result_obj = compat_urlparse.parse_qs(api_page)
   99 
  100         info_table = get_element_by_attribute('class', 'info', page)
  101         uploader_id = self._html_search_regex(
  102             r'<!-- 發表作者 -->：[\n ]+<a href="/([a-z0-9]+)"',
  103             info_table, 'uploader id', fatal=False)
  104         mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' +
  105                          r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
  106         if mobj:
  107             upload_date = '%s%02d%02d' % (
  108                 mobj.group('year'),
  109                 month_by_abbreviation(mobj.group('mon')),
  110                 int(mobj.group('day')))
  111         else:
  112             upload_date = None
  113         duration = float_or_none(api_result_obj['totaltime'][0], scale=1000)
  114 
  115         return {
  116             'id': video_id,
  117             'url': api_result_obj['mp3file'][0],
  118             'title': title,
  119             'description': self._html_search_meta('description', page),
  120             'duration': duration,
  121             'uploader_id': uploader_id,
  122             'upload_date': upload_date,
  123         }