summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor/buzzfeed.py
blob: ec411091efe7dc15b28c7f4a3939bff89395fa59 (plain)
    1 # coding: utf-8
    2 from __future__ import unicode_literals
    3 
    4 import json
    5 import re
    6 
    7 from .common import InfoExtractor
    8 from .facebook import FacebookIE
    9 
   10 
   11 class BuzzFeedIE(InfoExtractor):
   12     _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
   13     _TESTS = [{
   14         'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
   15         'info_dict': {
   16             'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
   17             'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
   18             'description': 'Rambro!',
   19         },
   20         'playlist': [{
   21             'info_dict': {
   22                 'id': 'aVCR29aE_OQ',
   23                 'ext': 'mp4',
   24                 'title': 'Angry Ram destroys a punching bag..',
   25                 'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
   26                 'upload_date': '20141024',
   27                 'uploader_id': 'Buddhanz1',
   28                 'uploader': 'Angry Ram',
   29             }
   30         }]
   31     }, {
   32         'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
   33         'params': {
   34             'skip_download': True,  # Got enough YouTube download tests
   35         },
   36         'info_dict': {
   37             'id': 'look-at-this-cute-dog-omg',
   38             'description': 're:Munchkin the Teddy Bear is back ?!',
   39             'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
   40         },
   41         'playlist': [{
   42             'info_dict': {
   43                 'id': 'mVmBL8B-In0',
   44                 'ext': 'mp4',
   45                 'title': 're:Munchkin the Teddy Bear gets her exercise',
   46                 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
   47                 'upload_date': '20141124',
   48                 'uploader_id': 'CindysMunchkin',
   49                 'uploader': 're:^Munchkin the',
   50             },
   51         }]
   52     }, {
   53         'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
   54         'info_dict': {
   55             'id': 'the-most-adorable-crash-landing-ever',
   56             'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
   57             'description': 'This gosling knows how to stick a landing.',
   58         },
   59         'playlist': [{
   60             'md5': '763ca415512f91ca62e4621086900a23',
   61             'info_dict': {
   62                 'id': '971793786185728',
   63                 'ext': 'mp4',
   64                 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
   65                 'uploader': 'Calgary Outdoor Centre-University of Calgary',
   66             },
   67         }],
   68         'add_ie': ['Facebook'],
   69     }]
   70 
   71     def _real_extract(self, url):
   72         playlist_id = self._match_id(url)
   73         webpage = self._download_webpage(url, playlist_id)
   74 
   75         all_buckets = re.findall(
   76             r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
   77             webpage)
   78 
   79         entries = []
   80         for bd_json in all_buckets:
   81             bd = json.loads(bd_json)
   82             video = bd.get('video') or bd.get('progload_video')
   83             if not video:
   84                 continue
   85             entries.append(self.url_result(video['url']))
   86 
   87         facebook_urls = FacebookIE._extract_urls(webpage)
   88         entries.extend([
   89             self.url_result(facebook_url)
   90             for facebook_url in facebook_urls])
   91 
   92         return {
   93             '_type': 'playlist',
   94             'id': playlist_id,
   95             'title': self._og_search_title(webpage),
   96             'description': self._og_search_description(webpage),
   97             'entries': entries,
   98         }

Generated by cgit