mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-13 18:51:03 +01:00
[ie/weibo] Fix extractor and support user extraction (#7657)
Closes #3964, Closes #4673, Closes #6979. Authored by: c-basalt
This commit is contained in:
parent
9e68747f96
commit
69b03f84f8
@ -2371,7 +2371,8 @@
|
|||||||
)
|
)
|
||||||
from .weibo import (
|
from .weibo import (
|
||||||
WeiboIE,
|
WeiboIE,
|
||||||
WeiboMobileIE
|
WeiboVideoIE,
|
||||||
|
WeiboUserIE,
|
||||||
)
|
)
|
||||||
from .weiqitv import WeiqiTVIE
|
from .weiqitv import WeiqiTVIE
|
||||||
from .weverse import (
|
from .weverse import (
|
||||||
|
@ -1,134 +1,241 @@
|
|||||||
from .common import InfoExtractor
|
|
||||||
|
|
||||||
import json
|
|
||||||
import random
|
import random
|
||||||
import re
|
import itertools
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
from ..compat import (
|
from .common import InfoExtractor
|
||||||
compat_parse_qs,
|
|
||||||
compat_str,
|
|
||||||
)
|
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
js_to_json,
|
int_or_none,
|
||||||
|
make_archive_id,
|
||||||
|
mimetype2ext,
|
||||||
|
parse_resolution,
|
||||||
|
str_or_none,
|
||||||
strip_jsonp,
|
strip_jsonp,
|
||||||
|
traverse_obj,
|
||||||
|
url_or_none,
|
||||||
urlencode_postdata,
|
urlencode_postdata,
|
||||||
|
urljoin,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class WeiboIE(InfoExtractor):
|
class WeiboBaseIE(InfoExtractor):
|
||||||
_VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)'
|
def _update_visitor_cookies(self, video_id):
|
||||||
_TEST = {
|
visitor_data = self._download_json(
|
||||||
'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',
|
'https://passport.weibo.com/visitor/genvisitor', video_id,
|
||||||
'info_dict': {
|
note='Generating first-visit guest request',
|
||||||
'id': 'Fp6RGfbff',
|
transform_source=strip_jsonp,
|
||||||
'ext': 'mp4',
|
data=urlencode_postdata({
|
||||||
'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博',
|
'cb': 'gen_callback',
|
||||||
}
|
'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}',
|
||||||
}
|
}))
|
||||||
|
|
||||||
def _real_extract(self, url):
|
self._download_webpage(
|
||||||
video_id = self._match_id(url)
|
'https://passport.weibo.com/visitor/visitor', video_id,
|
||||||
# to get Referer url for genvisitor
|
note='Running first-visit callback to get guest cookies',
|
||||||
webpage, urlh = self._download_webpage_handle(url, video_id)
|
query={
|
||||||
|
'a': 'incarnate',
|
||||||
visitor_url = urlh.url
|
't': visitor_data['data']['tid'],
|
||||||
|
'w': 2,
|
||||||
if 'passport.weibo.com' in visitor_url:
|
'c': '%03d' % visitor_data['data']['confidence'],
|
||||||
# first visit
|
'cb': 'cross_domain',
|
||||||
visitor_data = self._download_json(
|
'from': 'weibo',
|
||||||
'https://passport.weibo.com/visitor/genvisitor', video_id,
|
'_rand': random.random(),
|
||||||
note='Generating first-visit data',
|
|
||||||
transform_source=strip_jsonp,
|
|
||||||
headers={'Referer': visitor_url},
|
|
||||||
data=urlencode_postdata({
|
|
||||||
'cb': 'gen_callback',
|
|
||||||
'fp': json.dumps({
|
|
||||||
'os': '2',
|
|
||||||
'browser': 'Gecko57,0,0,0',
|
|
||||||
'fonts': 'undefined',
|
|
||||||
'screenInfo': '1440*900*24',
|
|
||||||
'plugins': '',
|
|
||||||
}),
|
|
||||||
}))
|
|
||||||
|
|
||||||
tid = visitor_data['data']['tid']
|
|
||||||
cnfd = '%03d' % visitor_data['data']['confidence']
|
|
||||||
|
|
||||||
self._download_webpage(
|
|
||||||
'https://passport.weibo.com/visitor/visitor', video_id,
|
|
||||||
note='Running first-visit callback',
|
|
||||||
query={
|
|
||||||
'a': 'incarnate',
|
|
||||||
't': tid,
|
|
||||||
'w': 2,
|
|
||||||
'c': cnfd,
|
|
||||||
'cb': 'cross_domain',
|
|
||||||
'from': 'weibo',
|
|
||||||
'_rand': random.random(),
|
|
||||||
})
|
|
||||||
|
|
||||||
webpage = self._download_webpage(
|
|
||||||
url, video_id, note='Revisiting webpage')
|
|
||||||
|
|
||||||
title = self._html_extract_title(webpage)
|
|
||||||
|
|
||||||
video_formats = compat_parse_qs(self._search_regex(
|
|
||||||
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
|
|
||||||
|
|
||||||
formats = []
|
|
||||||
supported_resolutions = (480, 720)
|
|
||||||
for res in supported_resolutions:
|
|
||||||
vid_urls = video_formats.get(compat_str(res))
|
|
||||||
if not vid_urls or not isinstance(vid_urls, list):
|
|
||||||
continue
|
|
||||||
|
|
||||||
vid_url = vid_urls[0]
|
|
||||||
formats.append({
|
|
||||||
'url': vid_url,
|
|
||||||
'height': res,
|
|
||||||
})
|
})
|
||||||
|
|
||||||
uploader = self._og_search_property(
|
def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
|
||||||
'nick-name', webpage, 'uploader', default=None)
|
webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
|
||||||
|
if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
|
||||||
|
self._update_visitor_cookies(video_id)
|
||||||
|
webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
|
||||||
|
return self._parse_json(webpage, video_id, fatal=fatal)
|
||||||
|
|
||||||
|
def _extract_formats(self, video_info):
    """Build the list of format dicts for one Weibo status.

    Reads ``video_info['page_info']['media_info']``. Every ``playback_list``
    entry whose ``play_info.url`` is a valid URL is mapped to a format dict.
    If that produces nothing, the direct values of ``media_info`` are scanned
    for URLs containing ``label=`` and formats are built from those instead.

    @param video_info  status metadata dict as returned by the Weibo API
    @returns           list of format dicts (may be empty)
    """
    media_info = traverse_obj(video_info, ('page_info', 'media_info'))
    formats = traverse_obj(media_info, (
        'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
            'url': 'url',
            'format': ('quality_desc', {str}),
            'format_id': ('label', {str}),
            'ext': ('mime', {mimetype2ext}),
            # a bitrate of 0 is treated as unknown
            'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
            'vcodec': ('video_codecs', {str}),
            'fps': ('fps', {int_or_none}),
            'width': ('width', {int_or_none}),
            'height': ('height', {int_or_none}),
            'filesize': ('size', {int_or_none}),
            'acodec': ('audio_codecs', {str}),
            'asr': ('audio_sample_rate', {int_or_none}),
            'audio_channels': ('audio_channels', {int_or_none}),
        }))
    if formats:
        return formats

    # fallback, should be barely used
    # dict.fromkeys() de-duplicates like set() but keeps a stable order
    for url in dict.fromkeys(traverse_obj(media_info, (..., {url_or_none}))):
        if 'label=' not in url:  # filter out non-video urls
            continue
        format_id, resolution = self._search_regex(
            r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
            group=(1, 2), default=(None, None))
        # traverse_obj() gives None when no `video_details` entry matches this
        # label; fall back to {} so the ** unpacking below cannot raise
        details = traverse_obj(media_info, (
            'video_details', lambda _, v: v['label'].startswith(format_id), {
                # same field name as the primary path ('filesize', not 'size')
                'filesize': ('size', {int_or_none}),
                'tbr': ('bitrate', {int_or_none}),
            },
        ), get_all=False) or {}
        formats.append({
            'url': url,
            'format_id': format_id,
            **parse_resolution(resolution),
            **details,
        })
    return formats
|
||||||
|
|
||||||
|
def _parse_video_info(self, video_info, video_id=None):
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
'title': title,
|
'extractor_key': WeiboIE.ie_key(),
|
||||||
'uploader': uploader,
|
'extractor': WeiboIE.IE_NAME,
|
||||||
'formats': formats
|
'formats': self._extract_formats(video_info),
|
||||||
|
'http_headers': {'Referer': 'https://weibo.com/'},
|
||||||
|
'_old_archive_ids': [make_archive_id('WeiboMobile', video_id)],
|
||||||
|
**traverse_obj(video_info, {
|
||||||
|
'id': (('id', 'id_str', 'mid'), {str_or_none}),
|
||||||
|
'display_id': ('mblogid', {str_or_none}),
|
||||||
|
'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
|
||||||
|
'description': ('text_raw', {str}),
|
||||||
|
'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
|
||||||
|
'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
|
||||||
|
'thumbnail': ('page_info', 'page_pic', {url_or_none}),
|
||||||
|
'uploader': ('user', 'screen_name', {str}),
|
||||||
|
'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
|
||||||
|
'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
|
||||||
|
'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
|
||||||
|
'like_count': ('attitudes_count', {int_or_none}),
|
||||||
|
'repost_count': ('reposts_count', {int_or_none}),
|
||||||
|
}, get_all=False),
|
||||||
|
'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class WeiboMobileIE(InfoExtractor):
|
class WeiboIE(WeiboBaseIE):
|
||||||
_VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?'
|
_VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
|
||||||
_TEST = {
|
_TESTS = [{
|
||||||
'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0',
|
'url': 'https://weibo.com/7827771738/N4xlMvjhI',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '4910815147462302',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'display_id': 'N4xlMvjhI',
|
||||||
|
'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
|
||||||
|
'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
|
||||||
|
'duration': 918,
|
||||||
|
'timestamp': 1686312819,
|
||||||
|
'upload_date': '20230609',
|
||||||
|
'thumbnail': r're:https://.*\.jpg',
|
||||||
|
'uploader': '睡前视频基地',
|
||||||
|
'uploader_id': '7827771738',
|
||||||
|
'uploader_url': 'https://weibo.com/u/7827771738',
|
||||||
|
'view_count': int,
|
||||||
|
'like_count': int,
|
||||||
|
'repost_count': int,
|
||||||
|
'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
|
||||||
|
},
|
||||||
|
}, {
|
||||||
|
'url': 'https://m.weibo.cn/status/4189191225395228',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '4189191225395228',
|
'id': '4189191225395228',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': '午睡当然是要甜甜蜜蜜的啦',
|
'display_id': 'FBqgOmDxO',
|
||||||
'uploader': '柴犬柴犬'
|
'title': '柴犬柴犬的秒拍视频',
|
||||||
|
'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
|
||||||
|
'duration': 53,
|
||||||
|
'timestamp': 1514264429,
|
||||||
|
'upload_date': '20171226',
|
||||||
|
'thumbnail': r're:https://.*\.jpg',
|
||||||
|
'uploader': '柴犬柴犬',
|
||||||
|
'uploader_id': '5926682210',
|
||||||
|
'uploader_url': 'https://weibo.com/u/5926682210',
|
||||||
|
'view_count': int,
|
||||||
|
'like_count': int,
|
||||||
|
'repost_count': int,
|
||||||
}
|
}
|
||||||
}
|
}, {
|
||||||
|
'url': 'https://weibo.com/0/4224132150961381',
|
||||||
|
'note': 'no playback_list example',
|
||||||
|
'only_matching': True,
|
||||||
|
}]
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
# to get Referer url for genvisitor
|
|
||||||
webpage = self._download_webpage(url, video_id, note='visit the page')
|
|
||||||
|
|
||||||
weibo_info = self._parse_json(self._search_regex(
|
return self._parse_video_info(self._weibo_download_json(
|
||||||
r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};',
|
f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id))
|
||||||
webpage, 'js_code', flags=re.DOTALL),
|
|
||||||
video_id, transform_source=js_to_json)
|
|
||||||
|
|
||||||
status_data = weibo_info.get('status', {})
|
|
||||||
page_info = status_data.get('page_info')
|
|
||||||
title = status_data['status_title']
|
|
||||||
uploader = status_data.get('user', {}).get('screen_name')
|
|
||||||
|
|
||||||
return {
|
class WeiboVideoIE(WeiboBaseIE):
|
||||||
'id': video_id,
|
_VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
|
||||||
'title': title,
|
_TESTS = [{
|
||||||
'uploader': uploader,
|
'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
|
||||||
'url': page_info['media_info']['stream_url']
|
'info_dict': {
|
||||||
|
'id': '4797700463137878',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'display_id': 'LEZDodaiW',
|
||||||
|
'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了',
|
||||||
|
'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ',
|
||||||
|
'duration': 76,
|
||||||
|
'timestamp': 1659344278,
|
||||||
|
'upload_date': '20220801',
|
||||||
|
'thumbnail': r're:https://.*\.jpg',
|
||||||
|
'uploader': '君子爱财陈平安',
|
||||||
|
'uploader_id': '3905382233',
|
||||||
|
'uploader_url': 'https://weibo.com/u/3905382233',
|
||||||
|
'view_count': int,
|
||||||
|
'like_count': int,
|
||||||
|
'repost_count': int,
|
||||||
}
|
}
|
||||||
|
}]
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
video_id = self._match_id(url)
|
||||||
|
|
||||||
|
post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()
|
||||||
|
video_info = self._weibo_download_json(
|
||||||
|
f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}',
|
||||||
|
video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo']
|
||||||
|
return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE)
|
||||||
|
|
||||||
|
|
||||||
|
class WeiboUserIE(WeiboBaseIE):
    """Playlist extractor for the videos of a weibo.com user (``/u/<uid>`` URLs)."""

    _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://weibo.com/u/2066652961?tabtype=video',
        'info_dict': {
            'id': '2066652961',
            'title': '萧影殿下的视频',
            'description': '萧影殿下的全部视频',
            'uploader': '萧影殿下',
        },
        'playlist_mincount': 195,
    }]

    def _fetch_page(self, uid, cursor=0, page=1):
        """Download one page of the user's video list and return its ``data`` object.

        @param uid     numeric user id taken from the URL
        @param cursor  pagination cursor sent to the API (0 for the first page)
        @param page    1-based page number, only used in the progress note
        """
        return self._weibo_download_json(
            'https://weibo.com/ajax/profile/getWaterFallContent',
            uid, note=f'Downloading videos page {page}',
            query={'uid': uid, 'cursor': cursor})['data']

    def _entries(self, uid, first_page):
        """Yield an info dict for every video, following ``next_cursor`` page by page.

        @param uid         numeric user id taken from the URL
        @param first_page  already downloaded first page (result of _fetch_page)
        """
        cursor = 0
        for page in itertools.count(1):
            response = first_page if page == 1 else self._fetch_page(uid, cursor, page)
            for video_info in traverse_obj(response, ('list', ..., {dict})):
                # Pass the status id explicitly so that _parse_video_info does
                # not build its archive id from the default video_id of None
                video_id = traverse_obj(
                    video_info, (('id', 'id_str', 'mid'), {str_or_none}), get_all=False)
                yield self._parse_video_info(video_info, video_id)
            cursor = response.get('next_cursor')
            # stop on a missing, zero, non-numeric or negative cursor
            if (int_or_none(cursor) or -1) < 0:
                break

    def _real_extract(self, url):
        uid = self._match_id(url)
        first_page = self._fetch_page(uid)
        # the uploader name is taken from the first listed video that has one
        uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)
        metainfo = {
            'title': f'{uploader}的视频',
            'description': f'{uploader}的全部视频',
            'uploader': uploader,
        } if uploader else {}

        return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)
|
||||||
|
Loading…
Reference in New Issue
Block a user