[soundcloud] improve extraction

- improve format extraction(closes #22123)
- extract uploader_id and uploader_url(closes #21916)
- extract all known thumbnails(closes #19071)(closes #20659)
- fix extraction for private playlists(closes #20976)
- add support for playlist embeds(#20976)
- skip preview formats(closes #22806)
This commit is contained in:
Remita Amine 2019-10-27 17:52:46 +01:00
parent 0b98f3a751
commit 548c395716
3 changed files with 248 additions and 256 deletions

View File

@ -1033,6 +1033,7 @@
from .sohu import SohuIE from .sohu import SohuIE
from .sonyliv import SonyLIVIE from .sonyliv import SonyLIVIE
from .soundcloud import ( from .soundcloud import (
SoundcloudEmbedIE,
SoundcloudIE, SoundcloudIE,
SoundcloudSetIE, SoundcloudSetIE,
SoundcloudUserIE, SoundcloudUserIE,

View File

@ -80,7 +80,7 @@
from .kaltura import KalturaIE from .kaltura import KalturaIE
from .eagleplatform import EaglePlatformIE from .eagleplatform import EaglePlatformIE
from .facebook import FacebookIE from .facebook import FacebookIE
from .soundcloud import SoundcloudIE from .soundcloud import SoundcloudEmbedIE
from .tunein import TuneInBaseIE from .tunein import TuneInBaseIE
from .vbox7 import Vbox7IE from .vbox7 import Vbox7IE
from .dbtv import DBTVIE from .dbtv import DBTVIE
@ -2749,9 +2749,9 @@ def _real_extract(self, url):
return self.url_result(myvi_url) return self.url_result(myvi_url)
# Look for embedded soundcloud player # Look for embedded soundcloud player
soundcloud_urls = SoundcloudIE._extract_urls(webpage) soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage)
if soundcloud_urls: if soundcloud_urls:
return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML)
# Look for tunein player # Look for tunein player
tunein_urls = TuneInBaseIE._extract_urls(webpage) tunein_urls = TuneInBaseIE._extract_urls(webpage)

View File

@ -11,14 +11,13 @@
from ..compat import ( from ..compat import (
compat_str, compat_str,
compat_urlparse, compat_urlparse,
compat_urllib_parse_urlencode,
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
float_or_none, float_or_none,
HEADRequest,
int_or_none, int_or_none,
KNOWN_EXTENSIONS, KNOWN_EXTENSIONS,
merge_dicts,
mimetype2ext, mimetype2ext,
str_or_none, str_or_none,
try_get, try_get,
@ -28,6 +27,20 @@
) )
class SoundcloudEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?url=(?P<id>.*)'
@staticmethod
def _extract_urls(webpage):
return [m.group('url') for m in re.finditer(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
webpage)]
def _real_extract(self, url):
return self.url_result(compat_urlparse.parse_qs(
compat_urlparse.urlparse(url).query)['url'][0])
class SoundcloudIE(InfoExtractor): class SoundcloudIE(InfoExtractor):
"""Information extractor for soundcloud.com """Information extractor for soundcloud.com
To access the media, the uid of the song and a stream token To access the media, the uid of the song and a stream token
@ -44,9 +57,8 @@ class SoundcloudIE(InfoExtractor):
(?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/? (?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$) (?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
(?:/?\?secret_token=(?P<secret_token>[^&]+))?) (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
|(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
) )
''' '''
IE_NAME = 'soundcloud' IE_NAME = 'soundcloud'
@ -60,6 +72,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
'uploader': 'E.T. ExTerrestrial Music', 'uploader': 'E.T. ExTerrestrial Music',
'uploader_id': '1571244',
'timestamp': 1349920598, 'timestamp': 1349920598,
'upload_date': '20121011', 'upload_date': '20121011',
'duration': 143.216, 'duration': 143.216,
@ -79,6 +92,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Goldrushed', 'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept', 'uploader': 'The Royal Concept',
'uploader_id': '9615865',
'timestamp': 1337635207, 'timestamp': 1337635207,
'upload_date': '20120521', 'upload_date': '20120521',
'duration': 30, 'duration': 30,
@ -92,6 +106,7 @@ class SoundcloudIE(InfoExtractor):
# rtmp # rtmp
'skip_download': True, 'skip_download': True,
}, },
'skip': 'Preview',
}, },
# private link # private link
{ {
@ -103,6 +118,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Youtube - Dl Test Video \'\' Ä↭', 'title': 'Youtube - Dl Test Video \'\' Ä↭',
'description': 'test chars: \"\'/\\ä↭', 'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF', 'uploader': 'jaimeMF',
'uploader_id': '69767071',
'timestamp': 1386604920, 'timestamp': 1386604920,
'upload_date': '20131209', 'upload_date': '20131209',
'duration': 9.927, 'duration': 9.927,
@ -123,6 +139,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Youtube - Dl Test Video \'\' Ä↭', 'title': 'Youtube - Dl Test Video \'\' Ä↭',
'description': 'test chars: \"\'/\\ä↭', 'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF', 'uploader': 'jaimeMF',
'uploader_id': '69767071',
'timestamp': 1386604920, 'timestamp': 1386604920,
'upload_date': '20131209', 'upload_date': '20131209',
'duration': 9.927, 'duration': 9.927,
@ -143,6 +160,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Bus Brakes', 'title': 'Bus Brakes',
'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
'uploader': 'oddsamples', 'uploader': 'oddsamples',
'uploader_id': '73680509',
'timestamp': 1389232924, 'timestamp': 1389232924,
'upload_date': '20140109', 'upload_date': '20140109',
'duration': 17.346, 'duration': 17.346,
@ -163,6 +181,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
'uploader': 'Ori Uplift Music', 'uploader': 'Ori Uplift Music',
'uploader_id': '12563093',
'timestamp': 1504206263, 'timestamp': 1504206263,
'upload_date': '20170831', 'upload_date': '20170831',
'duration': 7449.096, 'duration': 7449.096,
@ -183,6 +202,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Sideways (Prod. Mad Real)', 'title': 'Sideways (Prod. Mad Real)',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'uploader': 'garyvee', 'uploader': 'garyvee',
'uploader_id': '2366352',
'timestamp': 1488152409, 'timestamp': 1488152409,
'upload_date': '20170226', 'upload_date': '20170226',
'duration': 207.012, 'duration': 207.012,
@ -207,6 +227,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Mezzo Valzer', 'title': 'Mezzo Valzer',
'description': 'md5:4138d582f81866a530317bae316e8b61', 'description': 'md5:4138d582f81866a530317bae316e8b61',
'uploader': 'Giovanni Sarani', 'uploader': 'Giovanni Sarani',
'uploader_id': '3352531',
'timestamp': 1551394171, 'timestamp': 1551394171,
'upload_date': '20190228', 'upload_date': '20190228',
'duration': 180.157, 'duration': 180.157,
@ -221,114 +242,81 @@ class SoundcloudIE(InfoExtractor):
} }
] ]
_API_BASE = 'https://api.soundcloud.com/'
_API_V2_BASE = 'https://api-v2.soundcloud.com/'
_BASE_URL = 'https://soundcloud.com/'
_CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI' _CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI'
_IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
@staticmethod _ARTWORK_MAP = {
def _extract_urls(webpage): 'mini': 16,
return [m.group('url') for m in re.finditer( 'tiny': 20,
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', 'small': 32,
webpage)] 'badge': 47,
't67x67': 67,
'large': 100,
't300x300': 300,
'crop': 400,
't500x500': 500,
'original': 0,
}
@classmethod @classmethod
def _resolv_url(cls, url): def _resolv_url(cls, url):
return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID
def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2):
track_id = compat_str(info['id']) track_id = compat_str(info['id'])
title = info['title'] title = info['title']
name = full_title or track_id track_base_url = self._API_BASE + 'tracks/%s' % track_id
if quiet:
self.report_extraction(name)
thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url')
if isinstance(thumbnail, compat_str):
thumbnail = thumbnail.replace('-large', '-t500x500')
username = try_get(info, lambda x: x['user']['username'], compat_str)
def extract_count(key):
return int_or_none(info.get('%s_count' % key))
like_count = extract_count('favoritings')
if like_count is None:
like_count = extract_count('likes')
result = {
'id': track_id,
'uploader': username,
'timestamp': unified_timestamp(info.get('created_at')),
'title': title,
'description': info.get('description'),
'thumbnail': thumbnail,
'duration': float_or_none(info.get('duration'), 1000),
'webpage_url': info.get('permalink_url'),
'license': info.get('license'),
'view_count': extract_count('playback'),
'like_count': like_count,
'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'),
'genre': info.get('genre'),
}
format_urls = set() format_urls = set()
formats = [] formats = []
query = {'client_id': self._CLIENT_ID} query = {'client_id': self._CLIENT_ID}
if secret_token is not None: if secret_token:
query['secret_token'] = secret_token query['secret_token'] = secret_token
if info.get('downloadable', False):
# We can build a direct link to the song if info.get('downloadable'):
format_url = update_url_query( format_url = update_url_query(
'https://api.soundcloud.com/tracks/%s/download' % track_id, query) info.get('download_url') or track_base_url + '/download', query)
format_urls.add(format_url) format_urls.add(format_url)
if version == 2:
v1_info = self._download_json(
track_base_url, track_id, query=query, fatal=False) or {}
else:
v1_info = info
formats.append({ formats.append({
'format_id': 'download', 'format_id': 'download',
'ext': info.get('original_format', 'mp3'), 'ext': v1_info.get('original_format') or 'mp3',
'filesize': int_or_none(v1_info.get('original_content_size')),
'url': format_url, 'url': format_url,
'vcodec': 'none',
'preference': 10, 'preference': 10,
}) })
# Old API, does not work for some tracks (e.g. def invalid_url(url):
# https://soundcloud.com/giovannisarani/mezzo-valzer) return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url)
format_dict = self._download_json(
'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id,
track_id, 'Downloading track url', query=query, fatal=False)
if format_dict: def add_format(f, protocol):
for key, stream_url in format_dict.items(): mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url)
if stream_url in format_urls:
continue
format_urls.add(stream_url)
ext, abr = 'mp3', None
mobj = re.search(r'_([^_]+)_(\d+)_url', key)
if mobj: if mobj:
ext, abr = mobj.groups() for k, v in mobj.groupdict().items():
abr = int(abr) if not f.get(k):
if key.startswith('http'): f[k] = v
stream_formats = [{ format_id_list = []
'format_id': key, if protocol:
'ext': ext, format_id_list.append(protocol)
'url': stream_url, for k in ('ext', 'abr'):
}] v = f.get(k)
elif key.startswith('rtmp'): if v:
# The url doesn't have an rtmp app, we have to extract the playpath format_id_list.append(v)
url, path = stream_url.split('mp3:', 1) abr = f.get('abr')
stream_formats = [{
'format_id': key,
'url': url,
'play_path': 'mp3:' + path,
'ext': 'flv',
}]
elif key.startswith('hls'):
stream_formats = self._extract_m3u8_formats(
stream_url, track_id, ext, entry_protocol='m3u8_native',
m3u8_id=key, fatal=False)
else:
continue
if abr: if abr:
for f in stream_formats: f['abr'] = int(abr)
f['abr'] = abr f.update({
'format_id': '_'.join(format_id_list),
formats.extend(stream_formats) 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
})
formats.append(f)
# New API # New API
transcodings = try_get( transcodings = try_get(
@ -337,19 +325,18 @@ def extract_count(key):
if not isinstance(t, dict): if not isinstance(t, dict):
continue continue
format_url = url_or_none(t.get('url')) format_url = url_or_none(t.get('url'))
if not format_url: if not format_url or t.get('snipped') or '/preview/' in format_url:
continue continue
stream = self._download_json( stream = self._download_json(
update_url_query(format_url, query), track_id, fatal=False) format_url, track_id, query=query, fatal=False)
if not isinstance(stream, dict): if not isinstance(stream, dict):
continue continue
stream_url = url_or_none(stream.get('url')) stream_url = url_or_none(stream.get('url'))
if not stream_url: if invalid_url(stream_url):
continue
if stream_url in format_urls:
continue continue
format_urls.add(stream_url) format_urls.add(stream_url)
protocol = try_get(t, lambda x: x['format']['protocol'], compat_str) stream_format = t.get('format') or {}
protocol = stream_format.get('protocol')
if protocol != 'hls' and '/hls' in format_url: if protocol != 'hls' and '/hls' in format_url:
protocol = 'hls' protocol = 'hls'
ext = None ext = None
@ -357,109 +344,146 @@ def extract_count(key):
if preset: if preset:
ext = preset.split('_')[0] ext = preset.split('_')[0]
if ext not in KNOWN_EXTENSIONS: if ext not in KNOWN_EXTENSIONS:
mimetype = try_get( ext = mimetype2ext(stream_format.get('mime_type'))
t, lambda x: x['format']['mime_type'], compat_str) add_format({
ext = mimetype2ext(mimetype) or 'mp3'
format_id_list = []
if protocol:
format_id_list.append(protocol)
format_id_list.append(ext)
format_id = '_'.join(format_id_list)
formats.append({
'url': stream_url, 'url': stream_url,
'format_id': format_id,
'ext': ext, 'ext': ext,
'protocol': 'm3u8_native' if protocol == 'hls' else 'http', }, 'http' if protocol == 'progressive' else protocol)
})
if not formats:
# Old API, does not work for some tracks (e.g.
# https://soundcloud.com/giovannisarani/mezzo-valzer)
# and might serve preview URLs (e.g.
# http://www.soundcloud.com/snbrn/ele)
format_dict = self._download_json(
track_base_url + '/streams', track_id,
'Downloading track url', query=query, fatal=False) or {}
for key, stream_url in format_dict.items():
if invalid_url(stream_url):
continue
format_urls.add(stream_url)
mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key)
if mobj:
protocol, ext, abr = mobj.groups()
add_format({
'abr': abr,
'ext': ext,
'url': stream_url,
}, protocol)
if not formats: if not formats:
# We fallback to the stream_url in the original info, this # We fallback to the stream_url in the original info, this
# cannot be always used, sometimes it can give an HTTP 404 error # cannot be always used, sometimes it can give an HTTP 404 error
formats.append({ urlh = self._request_webpage(
'format_id': 'fallback', HEADRequest(info.get('stream_url') or track_base_url + '/stream'),
'url': update_url_query(info['stream_url'], query), track_id, query=query, fatal=False)
'ext': 'mp3', if urlh:
}) stream_url = urlh.geturl()
self._check_formats(formats, track_id) if not invalid_url(stream_url):
add_format({'url': stream_url}, 'http')
for f in formats: for f in formats:
f['vcodec'] = 'none' f['vcodec'] = 'none'
self._sort_formats(formats) self._sort_formats(formats)
result['formats'] = formats
return result user = info.get('user') or {}
thumbnails = []
artwork_url = info.get('artwork_url')
thumbnail = artwork_url or user.get('avatar_url')
if isinstance(thumbnail, compat_str):
if re.search(self._IMAGE_REPL_RE, thumbnail):
for image_id, size in self._ARTWORK_MAP.items():
i = {
'id': image_id,
'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail),
}
if image_id == 'tiny' and not artwork_url:
size = 18
elif image_id == 'original':
i['preference'] = 10
if size:
i.update({
'width': size,
'height': size,
})
thumbnails.append(i)
else:
thumbnails = [{'url': thumbnail}]
def extract_count(key):
return int_or_none(info.get('%s_count' % key))
return {
'id': track_id,
'uploader': user.get('username'),
'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
'uploader_url': user.get('permalink_url'),
'timestamp': unified_timestamp(info.get('created_at')),
'title': title,
'description': info.get('description'),
'thumbnails': thumbnails,
'duration': float_or_none(info.get('duration'), 1000),
'webpage_url': info.get('permalink_url'),
'license': info.get('license'),
'view_count': extract_count('playback'),
'like_count': extract_count('favoritings') or extract_count('likes'),
'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'),
'genre': info.get('genre'),
'formats': formats
}
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError('Invalid URL: %s' % url)
track_id = mobj.group('track_id') track_id = mobj.group('track_id')
new_info = {}
if track_id is not None: query = {
info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID 'client_id': self._CLIENT_ID,
}
if track_id:
info_json_url = self._API_V2_BASE + 'tracks/' + track_id
full_title = track_id full_title = track_id
token = mobj.group('secret_token') token = mobj.group('secret_token')
if token: if token:
info_json_url += '&secret_token=' + token query['secret_token'] = token
elif mobj.group('player'):
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
real_url = query['url'][0]
# If the token is in the query of the original url we have to
# manually add it
if 'secret_token' in query:
real_url += '?secret_token=' + query['secret_token'][0]
return self.url_result(real_url)
else: else:
# extract uploader (which is in the url) full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title')
uploader = mobj.group('uploader')
# extract simple title (uploader + slug of song title)
slug_title = mobj.group('title')
token = mobj.group('token') token = mobj.group('token')
full_title = resolve_title = '%s/%s' % (uploader, slug_title)
if token: if token:
resolve_title += '/%s' % token resolve_title += '/%s' % token
info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
webpage = self._download_webpage(url, full_title, fatal=False) version = 2
if webpage:
entries = self._parse_json(
self._search_regex(
r'var\s+c\s*=\s*(\[.+?\])\s*,\s*o\s*=Date\b', webpage,
'data', default='[]'), full_title, fatal=False)
if entries:
for e in entries:
if not isinstance(e, dict):
continue
if e.get('id') != 67:
continue
data = try_get(e, lambda x: x['data'][0], dict)
if data:
new_info = data
break
info_json_url = self._resolv_url(
'https://soundcloud.com/%s' % resolve_title)
# Contains some additional info missing from new_info
info = self._download_json( info = self._download_json(
info_json_url, full_title, 'Downloading info JSON') info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False)
if not info:
info = self._download_json(
info_json_url.replace(self._API_V2_BASE, self._API_BASE),
full_title, 'Downloading info JSON', query=query)
version = 1
return self._extract_info_dict( return self._extract_info_dict(info, full_title, token, version)
merge_dicts(info, new_info), full_title, secret_token=token)
class SoundcloudPlaylistBaseIE(SoundcloudIE): class SoundcloudPlaylistBaseIE(SoundcloudIE):
@staticmethod def _extract_track_entries(self, tracks, token=None):
def _extract_id(e): entries = []
return compat_str(e['id']) if e.get('id') else None for track in tracks:
track_id = str_or_none(track.get('id'))
def _extract_track_entries(self, tracks): url = track.get('permalink_url')
return [ if not url:
self.url_result( if not track_id:
track['permalink_url'], SoundcloudIE.ie_key(), continue
video_id=self._extract_id(track)) url = self._API_V2_BASE + 'tracks/' + track_id
for track in tracks if track.get('permalink_url')] if token:
url += '?secret_token=' + token
entries.append(self.url_result(
url, SoundcloudIE.ie_key(), track_id))
return entries
class SoundcloudSetIE(SoundcloudPlaylistBaseIE): class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
@ -480,41 +504,28 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
# extract uploader (which is in the url) full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title')
uploader = mobj.group('uploader')
# extract simple title (uploader + slug of song title)
slug_title = mobj.group('slug_title')
full_title = '%s/sets/%s' % (uploader, slug_title)
url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
token = mobj.group('token') token = mobj.group('token')
if token: if token:
full_title += '/' + token full_title += '/' + token
url += '/' + token
resolv_url = self._resolv_url(url) info = self._download_json(self._resolv_url(
info = self._download_json(resolv_url, full_title) self._BASE_URL + full_title), full_title)
if 'errors' in info: if 'errors' in info:
msgs = (compat_str(err['error_message']) for err in info['errors']) msgs = (compat_str(err['error_message']) for err in info['errors'])
raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
entries = self._extract_track_entries(info['tracks']) entries = self._extract_track_entries(info['tracks'], token)
return { return self.playlist_result(
'_type': 'playlist', entries, str_or_none(info.get('id')), info.get('title'))
'entries': entries,
'id': '%s' % info['id'],
'title': info['title'],
}
class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
_API_V2_BASE = 'https://api-v2.soundcloud.com'
def _extract_playlist(self, base_url, playlist_id, playlist_title): def _extract_playlist(self, base_url, playlist_id, playlist_title):
COMMON_QUERY = { COMMON_QUERY = {
'limit': 50, 'limit': 2000000000,
'client_id': self._CLIENT_ID, 'client_id': self._CLIENT_ID,
'linked_partitioning': '1', 'linked_partitioning': '1',
} }
@ -522,12 +533,13 @@ def _extract_playlist(self, base_url, playlist_id, playlist_title):
query = COMMON_QUERY.copy() query = COMMON_QUERY.copy()
query['offset'] = 0 query['offset'] = 0
next_href = base_url + '?' + compat_urllib_parse_urlencode(query) next_href = base_url
entries = [] entries = []
for i in itertools.count(): for i in itertools.count():
response = self._download_json( response = self._download_json(
next_href, playlist_id, 'Downloading track page %s' % (i + 1)) next_href, playlist_id,
'Downloading track page %s' % (i + 1), query=query)
collection = response['collection'] collection = response['collection']
@ -546,9 +558,8 @@ def resolve_entry(candidates):
continue continue
return self.url_result( return self.url_result(
permalink_url, permalink_url,
ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
video_id=self._extract_id(cand), str_or_none(cand.get('id')), cand.get('title'))
video_title=cand.get('title'))
for e in collection: for e in collection:
entry = resolve_entry((e, e.get('track'), e.get('playlist'))) entry = resolve_entry((e, e.get('track'), e.get('playlist')))
@ -559,11 +570,10 @@ def resolve_entry(candidates):
if not next_href: if not next_href:
break break
parsed_next_href = compat_urlparse.urlparse(response['next_href']) next_href = response['next_href']
qs = compat_urlparse.parse_qs(parsed_next_href.query) parsed_next_href = compat_urlparse.urlparse(next_href)
qs.update(COMMON_QUERY) query = compat_urlparse.parse_qs(parsed_next_href.query)
next_href = compat_urlparse.urlunparse( query.update(COMMON_QUERY)
parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
return { return {
'_type': 'playlist', '_type': 'playlist',
@ -609,7 +619,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
'url': 'https://soundcloud.com/jcv246/sets', 'url': 'https://soundcloud.com/jcv246/sets',
'info_dict': { 'info_dict': {
'id': '12982173', 'id': '12982173',
'title': 'Jordi / cv (Playlists)', 'title': 'Jordi / cv (Sets)',
}, },
'playlist_mincount': 2, 'playlist_mincount': 2,
}, { }, {
@ -636,39 +646,29 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
}] }]
_BASE_URL_MAP = { _BASE_URL_MAP = {
'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'all': 'stream/users/%s',
'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'tracks': 'users/%s/tracks',
'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'albums': 'users/%s/albums',
'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'sets': 'users/%s/playlists',
'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'reposts': 'stream/users/%s/reposts',
'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'likes': 'users/%s/likes',
'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, 'spotlight': 'users/%s/spotlight',
}
_TITLE_MAP = {
'all': 'All',
'tracks': 'Tracks',
'albums': 'Albums',
'sets': 'Playlists',
'reposts': 'Reposts',
'likes': 'Likes',
'spotlight': 'Spotlight',
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
uploader = mobj.group('user') uploader = mobj.group('user')
url = 'https://soundcloud.com/%s/' % uploader
resolv_url = self._resolv_url(url)
user = self._download_json( user = self._download_json(
resolv_url, uploader, 'Downloading user info') self._resolv_url(self._BASE_URL + uploader),
uploader, 'Downloading user info')
resource = mobj.group('rsrc') or 'all' resource = mobj.group('rsrc') or 'all'
return self._extract_playlist( return self._extract_playlist(
self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']), self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'],
'%s (%s)' % (user['username'], self._TITLE_MAP[resource])) str_or_none(user.get('id')),
'%s (%s)' % (user['username'], resource.capitalize()))
class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
@ -678,7 +678,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
'info_dict': { 'info_dict': {
'id': '286017854', 'id': '286017854',
'title': 'Track station: your-text', 'title': 'Track station: your text',
}, },
'playlist_mincount': 47, 'playlist_mincount': 47,
}] }]
@ -686,19 +686,17 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
track_name = self._match_id(url) track_name = self._match_id(url)
webpage = self._download_webpage(url, track_name) track = self._download_json(self._resolv_url(url), track_name)
track_id = self._search_regex( track_id = self._search_regex(
r'soundcloud:track-stations:(\d+)', webpage, 'track id') r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
return self._extract_playlist( return self._extract_playlist(
'%s/stations/soundcloud:track-stations:%s/tracks' self._API_V2_BASE + 'stations/%s/tracks' % track['id'],
% (self._API_V2_BASE, track_id), track_id, 'Track station: %s' % track['title'])
track_id, 'Track station: %s' % track_name)
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
_VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
IE_NAME = 'soundcloud:playlist' IE_NAME = 'soundcloud:playlist'
_TESTS = [{ _TESTS = [{
'url': 'https://api.soundcloud.com/playlists/4110309', 'url': 'https://api.soundcloud.com/playlists/4110309',
@ -713,29 +711,22 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id') playlist_id = mobj.group('id')
base_url = '%s//api.soundcloud.com/playlists/%s.json?' % (self.http_scheme(), playlist_id)
data_dict = { query = {
'client_id': self._CLIENT_ID, 'client_id': self._CLIENT_ID,
} }
token = mobj.group('token') token = mobj.group('token')
if token: if token:
data_dict['secret_token'] = token query['secret_token'] = token
data = compat_urllib_parse_urlencode(data_dict)
data = self._download_json( data = self._download_json(
base_url + data, playlist_id, 'Downloading playlist') self._API_V2_BASE + 'playlists/' + playlist_id,
playlist_id, 'Downloading playlist', query=query)
entries = self._extract_track_entries(data['tracks']) entries = self._extract_track_entries(data['tracks'], token)
return { return self.playlist_result(
'_type': 'playlist', entries, playlist_id, data.get('title'), data.get('description'))
'id': playlist_id,
'title': data.get('title'),
'description': data.get('description'),
'entries': entries,
}
class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
@ -753,18 +744,18 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
_SEARCH_KEY = 'scsearch' _SEARCH_KEY = 'scsearch'
_MAX_RESULTS_PER_PAGE = 200 _MAX_RESULTS_PER_PAGE = 200
_DEFAULT_RESULTS_PER_PAGE = 50 _DEFAULT_RESULTS_PER_PAGE = 50
_API_V2_BASE = 'https://api-v2.soundcloud.com'
def _get_collection(self, endpoint, collection_id, **query): def _get_collection(self, endpoint, collection_id, **query):
limit = min( limit = min(
query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
self._MAX_RESULTS_PER_PAGE) self._MAX_RESULTS_PER_PAGE)
query['limit'] = limit query.update({
query['client_id'] = self._CLIENT_ID 'limit': limit,
query['linked_partitioning'] = '1' 'client_id': self._CLIENT_ID,
query['offset'] = 0 'linked_partitioning': 1,
data = compat_urllib_parse_urlencode(query) 'offset': 0,
next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data) })
next_url = update_url_query(self._API_V2_BASE + endpoint, query)
collected_results = 0 collected_results = 0
@ -791,5 +782,5 @@ def _get_collection(self, endpoint, collection_id, **query):
break break
def _get_n_results(self, query, n): def _get_n_results(self, query, n):
tracks = self._get_collection('/search/tracks', query, limit=n, q=query) tracks = self._get_collection('search/tracks', query, limit=n, q=query)
return self.playlist_result(tracks, playlist_title=query) return self.playlist_result(tracks, playlist_title=query)